diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index 9ce48658b1..505fe77f89 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -1,3 +1,10 @@
+
+
 ##### System information (version)
 - [ ] I checked the problem with documentation, FAQ, open issues,
-  answers.opencv.org, Stack Overflow, etc and have not found solution
+  forum.opencv.org, Stack Overflow, etc and have not found solution
diff --git a/.github/workflows/arm64-build-checks.yml b/.github/workflows/arm64-build-checks.yml
index f5988c7895..d3cf532d59 100644
--- a/.github/workflows/arm64-build-checks.yml
+++ b/.github/workflows/arm64-build-checks.yml
@@ -1,6 +1,6 @@
 name: arm64 build checks
 
-on: [pull_request]
+on: workflow_dispatch
 
 jobs:
   build:
diff --git a/3rdparty/carotene/src/resize.cpp b/3rdparty/carotene/src/resize.cpp
index 49205573cd..aa5b756c75 100644
--- a/3rdparty/carotene/src/resize.cpp
+++ b/3rdparty/carotene/src/resize.cpp
@@ -758,7 +758,7 @@ inline void resizeAreaRounding(const Size2D &ssize, const Size2D &dsize,
     }
     else if (channels == 3)
     {
-        if ((wr == 2.0f) && (wr == 2.0f))
+        if ((wr == 2.0f) && (hr == 2.0f))
         {
 #ifndef __ANDROID__
             size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) * 3 : 0;
diff --git a/3rdparty/ffmpeg/ffmpeg.cmake b/3rdparty/ffmpeg/ffmpeg.cmake
index 8cf0f24f5e..3cd5e1be94 100644
--- a/3rdparty/ffmpeg/ffmpeg.cmake
+++ b/3rdparty/ffmpeg/ffmpeg.cmake
@@ -1,8 +1,8 @@
-# Binaries branch name: ffmpeg/master_20200908
-# Binaries were created for OpenCV: f445b826d084188077a5e9d204c4c33d1589f380
-ocv_update(FFMPEG_BINARIES_COMMIT "6152e132572dfdaa32887eabeb7199bef49b14dc")
-ocv_update(FFMPEG_FILE_HASH_BIN32 "37e2dadf776631acc8856e281f29cf42")
-ocv_update(FFMPEG_FILE_HASH_BIN64 "cf5dba83edf8619f57ccff4edb989c62")
+# Binaries branch name: ffmpeg/master_20210303
+# Binaries were created for OpenCV: 7ac6abe02a33bef445a5b77214ad31964e2c5cc1
+ocv_update(FFMPEG_BINARIES_COMMIT "629590c3ba09fb0c8eaa9ab858ff13d3a84ca1aa")
+ocv_update(FFMPEG_FILE_HASH_BIN32 "638065d5a0dab8a828879942375dcac4")
+ocv_update(FFMPEG_FILE_HASH_BIN64 "7f10ae2e6a080ba3714f7a38ee03ae15")
 ocv_update(FFMPEG_FILE_HASH_CMAKE "f8e65dbe4a3b4eedc0d2997e07c3f3fd")
 
 function(download_win_ffmpeg script_var)
diff --git a/3rdparty/libtiff/CMakeLists.txt b/3rdparty/libtiff/CMakeLists.txt
index 61e40b2885..2074888a52 100644
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@@ -239,6 +239,9 @@ if(HOST_BIG_ENDIAN)
 else()
   set(HOST_BIG_ENDIAN 0)
 endif()
+if(HOST_BIG_ENDIAN)
+  add_definitions(-DWORDS_BIGENDIAN)
+endif()
 
 # IEEE floating point
 set(HAVE_IEEEFP 1 CACHE STRING "IEEE floating point is available")
diff --git a/3rdparty/libtiff/ChangeLog b/3rdparty/libtiff/ChangeLog
index 1f50e20135..452dcb3a18 100644
--- a/3rdparty/libtiff/ChangeLog
+++ b/3rdparty/libtiff/ChangeLog
@@ -1,3 +1,2329 @@
+2020-12-19 Bob Friesenhahn
+
+ * libtiff 4.2.0 released.
+
+ * configure.ac: Pass tar-ustar option to AM_INIT_AUTOMAKE rather
+ than tar-pax since ustar POSIX 1003.1-1988 format is more portable
+ than PAX POSIX 1003.1-2001 format.
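For clarity, the carotene change above fixes a duplicated-operand test: the 2x area-downscale fast path is meant to require that both the width ratio (wr) and the height ratio (hr) equal 2, but the original condition tested wr twice, so hr was never checked. A minimal before/after sketch (the helper functions are illustrative, not part of the patch; only the condition comes from the diff):

```c
#include <stdbool.h>

/* Illustrative only: the fast path should be taken only when BOTH ratios are 2. */
bool fast_path_before(float wr, float hr)
{
    (void)hr;                            /* bug: hr was silently ignored */
    return (wr == 2.0f) && (wr == 2.0f); /* second test is a tautology */
}

bool fast_path_after(float wr, float hr)
{
    return (wr == 2.0f) && (hr == 2.0f); /* both ratios now checked */
}
```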
+
+2020-12-12 Even Rouault
+
+ Merge branch 'w_adjust-deflate_names' into 'master'
+ Set 'deflate' to DEFLATE_NAMES, instead of 'libdeflate'
+
+ See merge request libtiff/libtiff!174
+
+2020-12-12 Lemures Lemniscati
+
+ Set 'deflate' to DEFLATE_NAMES, instead of 'libdeflate'
+ 'lib' will be automatically added as a prefix while doing find_library()
+
+2020-12-12 Even Rouault
+
+ DoubleToRational(): avoid casting NaN to uint32 (fixes #227)
+
+2020-12-12 Even Rouault
+
+ Merge branch 'fix_221' into 'master'
+ tiffio.h: do not define __attribute__ but define TIFF_ATTRIBUTE instead (fixes #221)
+
+ Closes #221
+
+ See merge request libtiff/libtiff!173
+
+2020-12-12 Even Rouault
+
+ tiffio.h: do not define __attribute__ but define TIFF_ATTRIBUTE instead (fixes #221)
+
+2020-12-08 Even Rouault
+
+ TIFFReadDirEntryArrayWithLimit(): properly read from offline tag value when we clamp the number of strips to 1.
+ Fixes regression of commit 7057734d986001b7fd6d2afde9667da7754ff2cc on reading
+ a file with StripByteCounts with 1 element (broken) and StripOffsets with
+ 896 elements, and where StripOffsets[0] is correct
+
+ $ tiffdump foo.tif
+ Magic: 0x4949 Version: 0x2a
+ Directory 0: offset 25725448 (0x1888a08) next 0 (0)
+ SubFileType (254) LONG (4) 1<0>
+ ImageWidth (256) LONG (4) 1<640>
+ ImageLength (257) LONG (4) 1<20098>
+ BitsPerSample (258) SHORT (3) 1<16>
+ Photometric (262) SHORT (3) 1<1>
+ SamplesPerPixel (277) SHORT (3) 1<1>
+ ResolutionUnit (296) SHORT (3) 1<2>
+ StripByteCounts (279) LONG (4) 1<1806>
+ StripOffsets (273) LONG (4) 896<8 648 1288 1928 2568 3208 3848 4488 5128 5768 6408 7048 7688 8328 8968 9608 10248 10888 11528 12168 12808 13448 14088 14728 ...>
+
+2020-12-02 Even Rouault
+
+ tif_jpeg.c: avoid potential harmless unsigned integer overflow on data->fileoffset in JPEGFixupTagsSubsamplingSkip() by validating earlier. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28200
+
+2020-11-27 Even Rouault
+
+ Merge branch 'Jamaika1-master-patch-47839' into 'master'
+ Change ULARGE_INTEGER to LARGE_INTEGER
+
+ See merge request libtiff/libtiff!170
+
+2020-11-27 Even Rouault
+
+ Merge branch 'Jamaika1-master-patch-46397' into 'master'
+ Added stdint.h
+
+ See merge request libtiff/libtiff!171
+
+2020-11-27 Jamaika
+
+ Added stdint.h.
+
+ ```
+ tif_win32.c: In function '_tiffSizeProc':
+ tif_win32.c:159:23: warning: passing argument 2 of 'GetFileSizeEx' from incompatible pointer type [-Wincompatible-pointer-types]
+   159 |  if (GetFileSizeEx(fd,&m))
+       |                       ^~
+       |                       |
+       |                       ULARGE_INTEGER *
+ In file included from c:\msys1021\x86_64-w64-mingw32\include\winbase.h:18,
+                  from c:\msys1021\x86_64-w64-mingw32\include\windows.h:70,
+                  from tif_win32.c:32:
+ c:\msys1021\x86_64-w64-mingw32\include\fileapi.h:78:73: note: expected 'PLARGE_INTEGER' {aka 'LARGE_INTEGER *'} but argument is of type 'ULARGE_INTEGER *'
+    78 |  WINBASEAPI WINBOOL WINAPI GetFileSizeEx (HANDLE hFile, PLARGE_INTEGER lpFileSize);
+       |                                                         ~~~~~~~~~~~~~~~^~~~~~~~~~
+ ```
+
+2020-11-21 Even Rouault
+
+ Merge branch 'issue-113' into 'master'
+ tiffcrop: fix buffer overrun in extractContigSamples24bits()
+
+ Closes #113
+
+ See merge request libtiff/libtiff!169
+
+2020-11-21 Even Rouault
+
+ Merge branch 'issue-156' into 'master'
+ tiff2pdf: Check output size before writing
+
+ Closes #156
+
+ See merge request libtiff/libtiff!168
+
+2020-11-21 Even Rouault
+
+ Merge branch 'issue-201' into 'master'
+ tiff2pdf: enforce memory limit for tiled pictures too
+
+ Closes #201
+
+ See merge request libtiff/libtiff!167
+
+2020-11-20 Even Rouault
+
+ Merge branch 'issue-207' into 'master'
+ enforce (configurable) memory limit in tiff2rgba
+
+ Closes #209 and #207
+
+ See merge request libtiff/libtiff!165
+
+2020-11-20 Even Rouault
+
+ tif_lzw.c: avoid false positive -Wnull-dereference of mingw32 gcc 7.3.
+
+2020-11-17 Thomas Bernard
+
+ tiffcrop: fix buffer overrun in extractContigSamples24bits()
+ fixes #113
+
+ tiff2pdf: Check output size before writing.
+ fixes #156
+
+ tiff2pdf: enforce memory limit for tiled pictures too.
+ fixes #201
+
+2020-11-15 Thomas Bernard
+
+ tiff2rgba.1: -M option.
+
+ enforce (configurable) memory limit in tiff2rgba.
+ fixes #207
+ fixes #209
+
+2020-11-14 Even Rouault
+
+ Merge branch 'issue-220' into 'master'
+ tiff2pdf.c: properly calculate datasize when saving to JPEG YCbCr
+
+ Closes #220
+
+ See merge request libtiff/libtiff!159
+
+2020-11-14 Thomas Bernard
+
+ tiff2pdf.c: properly calculate datasize when saving to JPEG YCbCr.
+ fixes #220
+
+2020-11-14 Even Rouault
+
+ Merge branch 'issue-204' into 'master'
+ avoid buffer overflow while writing jpeg end of file marker
+
+ Closes #204
+
+ See merge request libtiff/libtiff!161
+
+2020-11-14 Even Rouault
+
+ Merge branch 'issue-193' into 'master'
+ fix buffer overflow in tiff2ps.c
+
+ Closes #193
+
+ See merge request libtiff/libtiff!162
+
+2020-11-14 Even Rouault
+
+ Merge branch 'skal65535-master-patch-91082' into 'master'
+ More overflow fixes for large widths
+
+ See merge request libtiff/libtiff!164
+
+2020-11-14 skal
+
+ More overflow fixes for large width.
+ Also: use INT_MAX instead of hard-coded constants.
+
+2020-11-12 Even Rouault
+
+ Merge branch 'skal65535-master-patch-56655' into 'master'
+ Fix potential overflow in gtStripContig()
+
+ See merge request libtiff/libtiff!163
+
+2020-11-12 Even Rouault
+
+ Merge branch 'issue-211' into 'master'
+ check for tile width overflow
+
+ Closes #211
+
+ See merge request libtiff/libtiff!160
+
+2020-11-12 skal
+
+ Fix potential overflow in gtStripContig()
+ (w + w) might not fit in int32 if too large.
+
+2020-11-09 Thomas Bernard
+
+ tiff2ps.c: fix buffer overread.
+ fixes #193
+
+ fix undefined behaviour (int shifted too much to the left)
+
+ avoid buffer overflow while writing jpeg end of file marker.
+ fixes #204
+
+ gtTileContig(): check Tile width for overflow.
+ fixes #211
+
+ fix warning messages (v32 is unsigned)
+
+2020-10-26 Even Rouault
+
+ TIFFStartStrip(): avoid potential crash in WebP codec when using scanline access on corrupted files. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=26650
+
+2020-10-20 Even Rouault
+
+ tif_webp.c: validate tile/strip dimension to avoid unsigned integer overflow in RGBA.size computation
+
+2020-10-19 Even Rouault
+
+ tif_zip.c: fix typo in comment.
+
+2020-10-16 Even Rouault
+
+ tiff.h: remove irrelevant warning about webp related pseudo-tags not being registered: they are purely internal libtiff concepts
+
+2020-10-16 Even Rouault
+
+ Merge branch 'libdeflate' into 'master'
+ Add support for building against libdeflate for faster Zip/Deflate compression/decompression
+
+ See merge request libtiff/libtiff!158
+
+2020-10-16 Even Rouault
+
+ test: add testdeflatelaststripextradata.sh.
+
+2020-10-16 Even Rouault
+
+ Add support for optional building against libdeflate for faster Zip/Deflate compression/decompression.
+ So we can have 2 kinds of builds with the Zip/Deflate codec:
+ - zlib only
+ - zlib + libdeflate
+
+ Speed improvements in the 35%-50% range can be expected when libdeflate is used.
+ Compression level up to 12 is now supported (capped to 9 when zlib is used).
+ Still requires zlib for situations where libdeflate cannot be used (that
+ is for scanline access, since libdeflate has no streaming mode)
+
+ Pseudo-tag TIFFTAG_DEFLATE_SUBCODEC=DEFLATE_SUBCODEC_ZLIB/DEFLATE_SUBCODEC_LIBDEFLATE
+ is added to control which subcodec (zlib or libdeflate) should be used (it defaults
+ of course to libdeflate, when it is available).
+ This is mostly aimed at being used on the writing side, to be able to reproduce
+ output of previous libtiff versions at a binary level, in situations where this would
+ be really needed. Or as a safety belt in case there would be unforeseen issues
+ with using libdeflate.
+ It can be used to know when libdeflate is available at runtime (DEFLATE_SUBCODEC_LIBDEFLATE
+ will be the default value in that situation).
+
+ Of course, deflate codestreams produced by libdeflate can be read by zlib, and vice-versa.
+
+2020-10-14 Even Rouault
+
+ tif_webp.c: fix compiler warnings with MSVC.
+
+2020-10-12 Even Rouault
+
+ Merge branch 'various_fixes' into 'master'
+ Fix compiler warnings about unused variables when assert() expands to nothing
+
+ See merge request libtiff/libtiff!157
+
+2020-10-12 Even Rouault
+
+ .gitignore: add entries for new files in test/
+
+ Fix compiler warnings about unused variables when assert() expands to nothing
+
+2020-10-09 Roger Leigh
+
+ Merge branch '215-cygwin-appveyor-fail' into 'master'
+ Update Appveyor CI build to build with VS2019 image
+
+ Closes #215
+
+ See merge request libtiff/libtiff!154
+
+2020-10-09 Roger Leigh
+
+ wip.
+
+ wip.
+
+ wip.
+
+ wip.
+
+ wip.
+
+ wip.
+
+2020-10-09 Roger Leigh
+
+ Merge branch 'TIFF-217_m_lib_path' into 'master'
+ cmake: Do not use absolute libm path
+
+ Closes #217
+
+ See merge request libtiff/libtiff!156
+
+2020-10-09 Roger Leigh
+
+ cmake: Do not use absolute libm path.
+
+2020-10-08 Even Rouault
+
+ tif_fax3.h: restore systematic calls to CLEANUP_RUNS()
+ now that SETVALUE() no longer causes overflows.
+ Those were removed per b351db8be1b4d3f712bdb9424a79d3174cc03202 and
+ 3440ac216463fcad170bbb391491e69730a59ffa.
+
+ As SETVALUE() now returns an error, this allows the decoder to exit.
+
+ Otherwise, the assert(x == lastx) in _TIFFFax3fillruns() can trigger.
+
+ Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=26201
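The 2020-10-16 libdeflate entry above introduces the TIFFTAG_DEFLATE_SUBCODEC pseudo-tag. A minimal sketch of how a writer could pin the zlib subcodec to reproduce pre-libdeflate output byte-for-byte (error handling omitted; assumes a libtiff ≥ 4.2 built with libdeflate support):

```c
#include "tiffio.h"

void open_deflate_writer_zlib_only(const char *path)
{
    TIFF *tif = TIFFOpen(path, "w");
    if (!tif)
        return;
    TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_ADOBE_DEFLATE);
    /* Force the zlib subcodec even when libdeflate is compiled in, to
     * reproduce the output of previous libtiff versions at a binary
     * level, as the entry above describes. */
    TIFFSetField(tif, TIFFTAG_DEFLATE_SUBCODEC, DEFLATE_SUBCODEC_ZLIB);
    /* ... set the remaining fields and write strips/tiles ... */
    TIFFClose(tif);
}
```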
+
+2020-10-06 Even Rouault
+
+ Merge branch 'check_TIFFFlushData1' into 'master'
+ FAX/JPEG/LZMA/PixarLog/ZIP/ZSTD codecs: make sure to check TIFFFlushData1() return value
+
+ See merge request libtiff/libtiff!155
+
+2020-10-04 Even Rouault
+
+ Merge branch 'shared-memory' into 'master'
+ Set the --shared-memory linker flag for Emscripten builds
+
+ See merge request libtiff/libtiff!153
+
+2020-10-03 Even Rouault
+
+ tiff2rgba.c: fix -Wold-style-declaration warning.
+
+ FAX/JPEG/LZMA/PixarLog/ZIP/ZSTD codecs: make sure to check TIFFFlushData1() return value
+
+2020-09-26 Even Rouault
+
+ tif_fax3.h: extra buffer overflow checks. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=25934
+
+2020-09-25 Roger Leigh
+
+ wip.
+
+ wip.
+
+ wip.
+
+ wip.
+
+ wip.
+
+ wip.
+
+ Update AppVeyor image.
+
+ test-appveyor.
+
+2020-09-24 Attila Oláh
+
+ Also pass --shared-memory to raw_decode.
+ This is needed when building for Emscripten with *both* WEBP and JPEG
+ support.
+
+ Set the --shared-memory linker flag for Emscripten builds.
+ This is only needed when building with WEBP support, which uses atomics,
+ therefore the linker needs the --shared-memory flag. The flag cannot be
+ added globally because not all executables link against libwebp.
+
+2020-09-22 Even Rouault
+
+ tif_fax3.h: return error when a buffer overflow occurs. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=25552 and https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=25849
+
+2020-09-11 Even Rouault
+
+ Merge branch 'fix-float-compare' into 'master'
+ Fix comparison for max negative float value.
+
+ See merge request libtiff/libtiff!152
+
+2020-09-11 Dirk Lemstra
+
+ Fix comparison for max negative float value.
+
+2020-09-07 Even Rouault
+
+ Fax3PreDecode(): reset curruns and refruns state variables.
+ to avoid out-of-bounds write triggered by GDAL when repeatedly
+ reading a corrupt strip.
+
+ Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=25493
+
+2020-06-06 Thomas Bernard
+
+ Merge branch 'issue-17' into 'master'
+ normalize tools behaviour regarding -h
+
+ Closes #17
+
+ See merge request libtiff/libtiff!115
+
+2020-05-31 Even Rouault
+
+ TWebPSetupEncode(): fix logic problem (and instead of or) in test that checks input is 8bit unsigned data
+
+2020-05-12 Even Rouault
+
+ TIFFGetConfiguredCODECs(): fix to avoid wrong structure to be returned for registered (ie non built-in) codecs
+
+2020-05-09 Even Rouault
+
+ Merge branch 'zstd-webp-update' into 'master'
+ gitlab-ci: use latest zstd and webp versions
+
+ See merge request libtiff/libtiff!148
+
+2020-05-09 Even Rouault
+
+ Merge branch 'deprecated' into 'master'
+ ojpeg: s/Depreciated/Deprecated/
+
+ See merge request libtiff/libtiff!149
+
+2020-05-09 Aaron Boxer
+
+ ojpeg: s/Depreciated/Deprecated/
+
+2020-04-27 Even Rouault
+
+ Fix typos.
+
+ tif_jpeg.c: avoid potential division in previous fix (master only)
+
+2020-04-26 Thomas Bernard
+
+ gitlab-ci: use latest zstd and webp versions.
+
+2020-04-26 Even Rouault
+
+ tiff.h: fixes to use ASCII only characters (master only)
+
+2020-04-26 Thomas Bernard
+
+ tiffsplit: use EXIT_SUCCESS / EXIT_FAILURE.
+
+ tiffset: print usage on stdout when -h is used.
+ also use EXIT_FAILURE / EXIT_SUCCESS
+ see #17
+
+ tiffmedian: show usage on stdout when -h is used.
+ also use EXIT_SUCCESS/EXIT_FAILURE
+ see #17
+
+ tiffinfo: print usage on stdout when -h is used.
+ also use EXIT_FAILURE / EXIT_SUCCESS
+ see #17
+
+ raw2tiff: print usage to stdout when -h is used.
+ see #17
+
+ tiff2pdf: print usage on stdout when -h is used.
+ see #17
+
+ tiffgt: output usage on stdout with -h.
+ also use EXIT_SUCCESS / EXIT_FAILURE
+
+ tiffdump: use EXIT_FAILURE / EXIT_SUCCESS.
+ see #17
+
+ tiffdither: print usage on stdout when -h is used.
+ see #17
+
+2020-04-26 Thomas Bernard
+
+ tiffcrop: -h / -v prints usage/version to stdout.
+ also uses the standard C EXIT_SUCCESS / EXIT_FAILURE
+ macros
+
+ see #17
+
+2020-04-26 Thomas Bernard
+
+ tiffcp: output usage to stdout when using -h.
+ also use EXIT_FAILURE / EXIT_SUCCESS
+ see #17
+
+ tiffcmp: match exit status for posix cmp and diff tools.
+
+ tiff2rgba: output usage to stdout when using -h.
+ also uses std C EXIT_FAILURE / EXIT_SUCCESS
+ see #17
+
+ tiff2ps: use EXIT_FAILURE / EXIT_SUCCESS.
+ see #17
+
+ tiff2bw: output usage on stdout when using -h.
+ also uses EXIT_SUCCESS / EXIT_FAILURE
+ see #17
+
+ thumbnail: use EXIT_FAILURE / EXIT_SUCCESS.
+ the -h option was already used so it cannot be used for help/usage
+ see #17
+
+ rgb2ycbcr: use EXIT_FAILURE / EXIT_SUCCESS.
+ the -h option was already used so it cannot be used for help/usage
+ see #17
+
+ ppm2tiff: output usage to stdout when using -h option.
+ also uses std C EXIT_SUCCESS / EXIT_FAILURE
+ see #17
+
+ pal2rgb: output usage to stdout when -h is used.
+ see #17
+
+ fax2tiff.c: print usage on stdout when using -h option.
+ see #17
+
+ fax2ps: output usage to stdout when using -h option.
+ also use EXIT_SUCCESS, EXIT_FAILURE from C standard
+
+2020-04-25 Even Rouault
+
+ Merge branch 'jpeg_multiscan_dos_logic' into 'master'
+ tif_jpeg.c: revise logic to detect potential excessive memory usage when...
+
+ See merge request libtiff/libtiff!147
+
+2020-04-24 Even Rouault
+
+ Merge branch 'issue-176' into 'master'
+ tiff2pdf: get rid of uninitialized memory content
+
+ Closes #176
+
+ See merge request libtiff/libtiff!143
+
+2020-04-24 Even Rouault
+
+ tif_jpeg.c: revise logic to detect potential excessive memory usage when decoding multiscan JPEG compressed images
+
+2020-04-19 Thomas Bernard
+
+ tiff2pdf: test the return code of TIFFReadRawStrip() and TIFFReadRawTile()
+
+ tiff2pdf.c: fix some whitespace problems in source.
+
+ tiff2pdf: get rid of uninitialized memory content.
+ fixes #176
+
+2020-04-19 Even Rouault
+
+ Merge branch 'issue-18' into 'master'
+ tiffset: pass size for TIFFTAG_INKNAMES
+
+ Closes #18
+
+ See merge request libtiff/libtiff!146
+
+2020-04-18 Olivier Paquet
+
+ Merge branch 'issue-80' into 'master'
+ tiffinfo: fix dump of Tiled images
+
+ Closes #80
+
+ See merge request libtiff/libtiff!144
+
+2020-04-15 Even Rouault
+
+ Fix wrong file size checks for memory-mapped BigTIFF files that could lead to image rejection
+
+2020-04-05 Thomas Bernard
+
+ tiffset: pass size for TIFFTAG_INKNAMES.
+ Uses TIFFFieldPassCount() to know which arguments need to be
+ passed to TIFFSetField()
+
+ fixes #18
+ see http://bugzilla.maptools.org/show_bug.cgi?id=2202
+
+2020-04-04 Thomas Bernard
+
+ tiffinfo: showdata for tiled images.
+
+ tiffinfo: fix dump of Tiled images.
+ fixes #80
+
+2020-04-03 Even Rouault
+
+ Merge branch 'issue-117' into 'master'
+ tiffcrop: enforce memory allocation limit
+
+ Closes #117
+
+ See merge request libtiff/libtiff!140
+
+2020-04-03 Thomas Bernard
+
+ tiffcrop: enforce memory allocation limit.
+ uses -k option to change limit (default to 256MiB)
+ fixes #117 / http://bugzilla.maptools.org/show_bug.cgi?id=2757
+
+2020-04-02 Even Rouault
+
+ Merge branch 'issue-45' into 'master'
+ tiffcp: disable strip chopping when trying to convert to JBIG compression
+
+ Closes #45
+
+ See merge request libtiff/libtiff!138
+
+2020-04-02 Even Rouault
+
+ Merge branch 'issue-124' into 'master'
+ TIFFGetFields(3tiff): TIFFTAG_*BYTECOUNTS TIFFTAG_*OFFSETS are uint64
+
+ Closes #124
+
+ See merge request libtiff/libtiff!137
+
+2020-04-02 Even Rouault
+
+ Merge branch 'aix_itrunc' into 'master'
+ Rename itrunc to fix name clash with a different itrunc in math.h on AIX. Fixes issue #189
+
+ Closes #189
+
+ See merge request libtiff/libtiff!139
+
+2020-04-01 Rob Boehne
+
+ Rename itrunc to fix name clash with a different itrunc in math.h on AIX. Fixes issue #189
+
+2020-04-01 Thomas Bernard
+
+ tiffcp: disable strip chopping when trying to convert to JBIG compression
+ fixes #45
+
+2020-03-29 Thomas Bernard
+
+ TIFFGetFields(3tiff): TIFFTAG_*BYTECOUNTS TIFFTAG_*OFFSETS are uint64.
+ fixes #124 / http://bugzilla.maptools.org/show_bug.cgi?id=2774
+
+2020-03-29 Even Rouault
+
+ Merge branch 'issue-48' into 'master'
+ tiff2pdf: fix "raw" copy of Deflate streams
+
+ Closes #48
+
+ See merge request libtiff/libtiff!136
+
+2020-03-27 Thomas Bernard
+
+ tiff2pdf: fix "raw" copy of Deflate streams.
+ The Predictor parameter was not copied from the source tiff to the PDF.
+ fixes #48 / http://bugzilla.maptools.org/show_bug.cgi?id=2442
+
+2020-03-26 Thomas Bernard
+
+ tif_fax3: quit Fax3Decode2D() when a buffer overflow occurs.
+ fixes #186
+
+2020-03-24 Even Rouault
+
+ Merge branch 'issue-143-144' into 'master'
+ tiffdump: avoid unaligned memory access
+
+ Closes #144 and #143
+
+ See merge request libtiff/libtiff!133
+
+2020-03-24 Even Rouault
+
+ Merge branch 'issue-133' into 'master'
+ tiff2pdf: avoid divide by 0
+
+ Closes #133
+
+ See merge request libtiff/libtiff!126
+
+2020-03-24 Thomas Bernard
+
+ tiff2pdf: normalizePoint() macro to normalize the white point.
+
+2020-03-23 Thomas Bernard
+
+ tiffdump: avoid unaligned memory access.
+ fixes #143
+ fixes #144
+
+2020-03-23 Even Rouault
+
+ Merge branch 'out-of-memory' into 'master'
+ tiffcp/tiff2pdf/tiff2ps: enforce maximum malloc size
+
+ Closes #153, #84, #116 and #115
+
+ See merge request libtiff/libtiff!130
+
+2020-03-23 Even Rouault
+
+ Merge branch 'issue-157' into 'master'
+ tiffset: check memory allocation
+
+ Closes #157
+
+ See merge request libtiff/libtiff!132
+
+2020-03-23 Even Rouault
+
+ Merge branch 'issue-185' into 'master'
+ tif_fax3: more buffer overflow checks in Fax3Decode2D()
+
+ Closes #185
+
+ See merge request libtiff/libtiff!131
+
+2020-03-23 Thomas Bernard
+
+ tiffset: check memory allocation.
+ fixes #157 / http://bugzilla.maptools.org/show_bug.cgi?id=2850
+
+ tif_fax3: more buffer overflow checks in Fax3Decode2D()
+ fixes #185
+
+2020-03-21 Thomas Bernard
+
+ tiff2ps: enforce memory allocation limit.
+ fixes #153 / http://bugzilla.maptools.org/show_bug.cgi?id=2845
+
+ tiff2pdf: enforce maximum data size.
+ fixes #116 / http://bugzilla.maptools.org/show_bug.cgi?id=2756
+ fixes #84 / http://bugzilla.maptools.org/show_bug.cgi?id=2683
+
+ update man page for tiffcp regarding the -m option.
+
+ tiffcp.c: _TIFFmalloc() => limitMalloc()
+
+2020-03-21 Thomas Bernard
+
+ tiffcp: enforce maximum malloc size.
+ default is 256MB.
+ use -m option to change
+
+ fixes #115 / http://bugzilla.maptools.org/show_bug.cgi?id=2755
+
+2020-03-21 Even Rouault
+
+ Merge branch 'issue-184' into 'master'
+ CMakeLists.txt: define WORDS_BIGENDIAN when the CPU is big endian
+
+ Closes #184
+
+ See merge request libtiff/libtiff!127
+
+2020-03-21 Even Rouault
+
+ Merge branch 'issue-44' into 'master'
+ tiff2pdf: "" causes the relevant argument not to be written
+
+ Closes #44
+
+ See merge request libtiff/libtiff!128
+
+2020-03-21 Even Rouault
+
+ Merge branch 'issue-56' into 'master'
+ fix man for TIFFReadEncodedStrip(), TIFFStripSize, TIFFVStripSize, TIFFRawStripSize
+
+ Closes #56
+
+ See merge request libtiff/libtiff!129
+
+2020-03-20 Thomas Bernard
+
+ fix man for TIFFReadEncodedStrip(), TIFFStripSize, TIFFVStripSize, TIFFRawStripSize
+ fixes #56
+ http://bugzilla.maptools.org/show_bug.cgi?id=2507
+
+ tiff2pdf: "" causes the relevant argument not to be written.
+ fixes #44
+
+ CMakeLists.txt: define WORDS_BIGENDIAN when the CPU is big endian.
+ fixes #184
+
+2020-03-17 Thomas Bernard
+
+ tiff2pdf: avoid divide by 0.
+ fixes #133 http://bugzilla.maptools.org/show_bug.cgi?id=2796
+
+2020-03-17 Even Rouault
+
+ Merge branch 'issue-22' into 'master'
+ do not _tiffMapProc 0 size files
+
+ Closes #22
+
+ See merge request libtiff/libtiff!125
+
+2020-03-13 Thomas Bernard
+
+ tif_win32.c: do not _tiffMapProc() 0 sized files.
+ see #22
+
+ tif_unix.c: do not _tiffMapProc 0 size files.
+ fixes #22
+ http://bugzilla.maptools.org/show_bug.cgi?id=2249
+
+2020-03-12 Even Rouault
+
+ tif_fax3.c: fix warning C4018: '<': signed/unsigned mismatch introduced in past commits
+
+2020-03-11 Even Rouault
+
+ tiff.h: mention TIFFTAG_RPCCOEFFICIENT, TIFFTAG_TIFF_RSID, TIFFTAG_GEO_METADATA
+
+2020-03-11 Even Rouault
+
+ Merge branch 'issue-60' into 'master'
+ added support for more private tags
+
+ Closes #60
+
+ See merge request libtiff/libtiff!124
+
+2020-03-11 Even Rouault
+
+ Merge branch 'issue-160' into 'master'
+ Fax3SetupState(): check consistency of rowbytes and rowpixels
+
+ Closes #160
+
+ See merge request libtiff/libtiff!123
+
+2020-03-11 Thomas Bernard
+
+ added support for more private tags.
+ see https://gitlab.com/libtiff/libtiff/-/issues/60
+ bugzilla.maptools.org/show_bug.cgi?id=2525
+
+ closes #60
+
+ original author : art1@andreas-romeyke.de
+
+2020-03-11 Thomas Bernard
+
+ Fax3SetupState(): check consistency of rowbytes and rowpixels.
+ also add some parameter documentation to Fax3Decode1D()
+
+ fixes #160
+ http://bugzilla.maptools.org/show_bug.cgi?id=2854
+
+2020-03-10 Even Rouault
+
+ Merge branch 'issue-11-const-pointers' into 'master'
+ Make pointers returned via TIFFGetField const
+
+ Closes #11
+
+ See merge request libtiff/libtiff!118
+
+2020-03-10 Even Rouault
+
+ tif_ojpeg.c: relax again too strict sanity checks to allow reading of valid images such as https://gitlab.com/libtiff/libtiff/-/issues/181#note_302535232. Fixes #181
+
+2020-03-09 Even Rouault
+
+ Merge branch 'issue-52' into 'master'
+ contrib/win_dib/tiff2dib: fix Uninitialized variable: lpBits
+
+ Closes #52
+
+ See merge request libtiff/libtiff!121
+
+2020-03-09 Thomas Bernard
+
+ contrib/win_dib/tiff2dib: fix Uninitialized variable: lpBits.
+ fixes #52
+ http://bugzilla.maptools.org/show_bug.cgi?id=2469
+
+2020-03-08 Even Rouault
+
+ Merge branch 'issue-58' into 'master'
+ Make TIFFTAG_CFAPATTERN variable count
+
+ Closes #58
+
+ See merge request libtiff/libtiff!120
+
+2020-03-08 Even Rouault
+
+ Merge branch 'issue-158-no-predictor-in-webp' into 'master'
+ TIFFTAG_PREDICTOR is not supported for WebP
+
+ Closes #158
+
+ See merge request libtiff/libtiff!119
+
+2020-03-08 Sam Hasinoff
+
+ Make TIFFTAG_CFAPATTERN variable count.
+ The TIFFTAG_CFAPATTERN tag (33422) from TIFF/EP, recently introduced in libtiff
+ 3363eda09d082e3e1dfffa6281f53085cac51ad3 / http://bugzilla.maptools.org/show_bug.cgi?id=2457
+ is described as having a fixed count of 4.
+ But the TIFF/EP spec says this should support a variable count (= CFARepeatRows * CFARepeatCols):
+
+ TIFF/EP, ISO 12234-2:2001
+ http://www.barrypearson.co.uk/top2009/downloads/TAG2000-22_DIS12234-2.pdf
+ page 18 and 26
+
+2020-03-08 Thomas Bernard
+
+ TIFFTAG_PREDICTOR is not supported for WebP.
+ fixes #158
+ https://gitlab.com/libtiff/libtiff/-/issues/158
+
+ this bug was introduced by 9eacd59fecc4ef593ac17689bc530ab451c8ec14
+ merge request !32
+
+2020-03-07 Adam Goode
+
+ Make the default whitepoint and ycbcrcoeffs arrays const.
+ Now that we are returning const pointers in TIFFGetFieldDefaulted,
+ we can now make these static default arrays const.
+
+ see #11
+
+2020-03-07 Adam Goode
+
+ Make pointers returned via TIFFGetField const.
+ According to http://bugzilla.maptools.org/show_bug.cgi?id=2125#c6
+ callers are not allowed to modify pointer or array values returned from
+ TIFFGetField or the like. So, make this explicit in the documentation
+ by specifying these things as const. Note that this is not an ABI
+ change, since C does not encode const in libraries. Also, this is
+ not really an API change, since the varargs call strips away all
+ the types anyway. So it really is more of a documentation change.
+
+ fixes #11
+
+2020-03-07 Even Rouault
+
+ CMake: Skip custom_dir_EXIF_231 test on shared builds to avoid issues on Windows
+
+2020-03-07 Even Rouault
+
+ Merge branch 'EXIF231_GPS_upgrade' into 'master'
+ EXIF 2.32 and GPS TIFF-tags and functionality upgraded.
+
+ See merge request libtiff/libtiff!91
+
+2020-03-07 Su_Laus
+
+ EXIF 2.32 and GPS tags and functionality upgraded.
+ - Existing EXIF field definition of tags is upgraded to EXIF version 2.3.2
+ - EXIF-GPS structure, tags and access functions are added as special CustomDirectory (like it was done for EXIF).
+ - Test program custom_dir_EXIF_231.c added to test writing/reading of EXIF IFD and GPS IFD tags
+ and to highlight some quirks of IFD-handling and peculiarities of reading/writing the different data types.
+ - Reading error for FileSource and SceneType tags corrected.
+
+ - EXIF_GPS_upgrade rebased onto c8c5309b765ef4ff097d2aaffbdb8f403db8967d (Merge branch 'Rational2DoublePrecision_correction' into 'master')
+ and adapted:
+ - tif_dirinfo.c: All rational tags set to TIFF_SETGET_FLOAT but only the GPSTAG_ tags set to TIFF_SETGET_DOUBLE.
+ - custom_dir_EXIF_231.c: Editorials amended and gcc warnings fixed.
+ - CMakeLists.txt: add_test(NAME "custom_dir_EXIF_231" COMMAND "custom_dir_EXIF_231") added.
+
+2020-03-07 Even Rouault
+
+ Merge branch 'issue-55' into 'master'
+ ppm2tiff: support any bps value from 1 to 16
+
+ Closes #55
+
+ See merge request libtiff/libtiff!106
+
+2020-03-07 Thomas Bernard
+
+ ppm2tiff: Add test for 16bpc PPM.
+
+ ppm2tiff: remove unused argument warning.
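The "Make pointers returned via TIFFGetField const" entry above amounts to a usage rule: arrays handed back by TIFFGetField()/TIFFGetFieldDefaulted() are owned by libtiff and must be treated as read-only. A small sketch of the intended calling pattern (illustrative; assumes a libtiff with the const change described above):

```c
#include <stdio.h>
#include "tiffio.h"

void print_whitepoint(TIFF *tif)
{
    /* Pointer is owned by libtiff: do not modify or free it. */
    const float *whitepoint = NULL;
    if (TIFFGetFieldDefaulted(tif, TIFFTAG_WHITEPOINT, &whitepoint))
        printf("white point: %g %g\n", whitepoint[0], whitepoint[1]);
}
```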
+
+2020-03-07 Ludolf Holzheid
+
+ ppm2tiff: support any bps value from 1 to 16.
+ fix #55
+ http://bugzilla.maptools.org/show_bug.cgi?id=2505
+
+ Patch originally submitted by Ludolf Holzheid
+
+2020-03-06 Even Rouault
+
+ Merge branch 'fax-test' into 'master'
+ add test for fax4 decoding
+
+ See merge request libtiff/libtiff!114
+
+2020-03-05 Thomas Bernard
+
+ add test for fax4 decoding.
+ This will check for regression on #46
+ https://gitlab.com/libtiff/libtiff/issues/46
+ http://bugzilla.maptools.org/show_bug.cgi?id=2434
+
+2020-03-05 Even Rouault
+
+ Merge branch 'freebsd-tests' into 'master'
+ make tests pass under FreeBSD.
+
+ See merge request libtiff/libtiff!113
+
+2020-03-05 Thomas Bernard
+
+ make tests pass under FreeBSD.
+ the -I option for the GNU diff and the FreeBSD diff
+ behave differently regarding escaping the ( ) and |
+
+ By using two -I options, we avoid using such characters.
+
+2020-03-05 Even Rouault
+
+ Merge branch 'issue-31' into 'master'
+ HTML
+
+ Closes #31
+
+ See merge request libtiff/libtiff!111
+
+2020-03-05 Even Rouault
+
+ Merge branch 'issue-179' into 'master'
+ tif_fax3.h: check for buffer overflow in EXPAND2D before "calling" CLEANUP_RUNS()
+
+ Closes #179
+
+ See merge request libtiff/libtiff!112
+
+2020-03-05 Thomas Bernard
+
+ v4.1.0.html: fix for validation.
+ long comments were replaced
+ because they confused some parsers
+
+ add DOCTYPE on v*.html.
+
+ fix HTML files so they are valid according to https://validator.w3.org.
+
+2020-03-05 Thomas Bernard
+
+ tif_fax3.h: check for buffer overflow in EXPAND2D before "calling" CLEANUP_RUNS()
+ fixes #179
+
+ this fixes the regression introduced in 02bb0175 / 72c4acef
+ ( merge request !110 )
+
+ It may be a better fix to do the overflow check in SETVALUE() but the
+ macro do { } while(0) construct makes it difficult to quit the loop
+ properly.
+
+2020-03-01 Thomas Bernard
+
+ index.html: fix unclosed tag.
+
+2020-03-01 Thomas Bernard
+
+ html: do not force colors (which are default anyway)
+ If needed, style should be set using CSS.
+
+ fixes #31
+ https://gitlab.com/libtiff/libtiff/issues/31
+ http://bugzilla.maptools.org/show_bug.cgi?id=2326
+
+2020-03-01 Even Rouault
+
+ TIFFReadCustomDirectory(): fix potential heap buffer overflow when reading a custom directory, after a regular directory where a codec was active. Fixes https://gitlab.com/libtiff/libtiff/issues/178
+
+2020-03-01 Even Rouault
+
+ Merge branch 'issue-46' into 'master'
+ fix decoding of fax4 images
+
+ Closes #46
+
+ See merge request libtiff/libtiff!110
+
+2020-02-29 Thomas Bernard
+
+ tif_fax3: better fix for CVE-2011-0192.
+ There are some legitimate cases which were forbidden by the previous fix
+
+ tif_fax3.h: allow 0 length run in DECODE2D.
+ fixes #46
+ https://gitlab.com/libtiff/libtiff/issues/46
+ http://bugzilla.maptools.org/show_bug.cgi?id=2434
+
+2020-02-29 Even Rouault
+
+ Merge branch 'mingwlibm' into 'master'
+ Don't use libm with libtiff due to conflict with libmsvcrt
+
+ See merge request libtiff/libtiff!73
+
+2020-02-29 Even Rouault
+
+ Merge branch 'Rational2DoublePrecision_correction' into 'master'
+ tif_dirwrite.c: bugfix DoubleToSrational() for plain signed integers
+
+ See merge request libtiff/libtiff!109
+
+2020-02-29 Su_Laus
+
+ tif_dirwrite.c: bugfix DoubleToSrational(), which returns plain signed integer values always as unsigned rationals. Add a test into rational_precision2double.c for "-1.0" and some editorials in tif_dirwrite.c.
+ (code is related to 6df997c786928757caea0dd68d26ea5f098f49df changes).
+
+2020-02-29 Even Rouault
+
+ Merge branch 'issue-174' into 'master'
+ tif_fax3.c: check buffer overflow in Fax4Decode()
+
+ Closes #174
+
+ See merge request libtiff/libtiff!108
+
+2020-02-29 Thomas Bernard
+
+ Fax4Decode(): log error message in case of buffer overrun.
+
+ tif_fax3.c: check buffer overflow in Fax4Decode()
+ fixes #174
+
+2020-02-28 Even Rouault
+
+ typo fixes in code comments.
+
+ ToRationalEuclideanGCD: remove useless test that confuses Coverity Scan about a potential later modulo by zero
+
+2020-02-27 Even Rouault
+
+ tif_dirwrite.c: fix other warnings related to 6df997c786928757caea0dd68d26ea5f098f49df changes
+
+ rational_precision2double.c: fix many warnings, and do not build it on CMake on shared lib builds
+
+ tif_dirwrite.c: fix various warnings found when building GDAL with internal libtiff after 6df997c786928757caea0dd68d26ea5f098f49df changes
+
+ tif_dirwrite.c: qualify ToRationalEuclideanGCD() with static.
+
+2020-02-27 Even Rouault
+
+ Merge branch 'Rational2DoublePrecision' into 'master'
+ Rational with Double Precision Upgrade
+
+ See merge request libtiff/libtiff!100
+
+2020-02-27 Su_Laus
+
+ Rational with Double Precision Upgrade.
+ Unfortunately, custom rational tags (TIFF_RATIONAL with field_bit=FIELD_CUSTOM) are defined as TIFF_SETGET_DOUBLE
+ but for the reading interface and LibTiff internally they are stored ALWAYS as floating point SINGLE precision.
+ Double precision custom rational tags are not supported by LibTiff.
+
+ For the GPS tags in WGS84 a higher accuracy / precision is needed.
+ Therefore, this upgrade is made, keeping the old interface for the already defined tags and allowing a double precision definition,
+ as well as calculating rationals with higher accuracy / precision.
+ This higher accuracy can be used for newly defined tags like that in EXIF/GPS.
+
+ Refer also to the very old Bugzilla issue 2542 (#69)
+
+ A test file rational_precision2double.c is added, which shows prevention of the old interface to the already defined custom rational tags
+ with the standard library as well as with the upgraded library.
+
+ Also TIFFTAG_XRESOLUTION, TIFFTAG_YRESOLUTION, TIFFTAG_XPOSITION, TIFFTAG_YPOSITION amended from TIFF_SETGET_DOUBLE to TIFF_SETGET_FLOAT and testcase inserted in rational_precision2double.c
+
+2020-02-26 Chris Degawa
+
+ mingw-w64 cmake: Don't find libm.
+ mingw-w64 will provide libm symbols by default without -lm and mingw-w64's
+ libm is just a stub.
+
+ This is just to make sure that on systems with msys2 and also cygwin, cmake
+ doesn't find a libm that actually contains math functions.
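To make the DoubleToSrational() bugfix above concrete: a signed-rational (SRATIONAL) conversion must preserve the sign, so -1.0 has to come back as -1/1 rather than 1/1. A toy illustration of the property the "-1.0" regression test checks (this is not libtiff's internal algorithm):

```c
#include <assert.h>
#include <math.h>
#include <stdint.h>

/* Toy conversion: split sign from magnitude so negative inputs keep
 * their sign; real code would also approximate fractional values. */
static void to_srational(double value, int32_t *num, int32_t *den)
{
    int sign = (value < 0) ? -1 : 1;
    *den = 1;
    *num = sign * (int32_t)llround(fabs(value));
}

int main(void)
{
    int32_t n, d;
    to_srational(-1.0, &n, &d);
    assert(n == -1 && d == 1); /* was wrongly returned as +1/1 before the fix */
    return 0;
}
```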
+
+2020-02-26 Even Rouault
+
+ Merge branch 'division-by-zero' into 'master'
+ tools/tiffcp.c: fix potential division by zero
+
+ See merge request libtiff/libtiff!83
+
+2020-02-26 Even Rouault
+
+ Merge branch 'fix-unused-warning' into 'master'
+ warnings: mark conditionally used parameters
+
+ See merge request libtiff/libtiff!49
+
+2020-02-26 Even Rouault
+
+ Merge branch 'master' into 'master'
+ fix issue #78 warnings regarding RichTIFFIPTC data type
+
+ Closes #78
+
+ See merge request libtiff/libtiff!99
+
+2020-02-26 Even Rouault
+
+ Merge branch 'win64-handle-casts-warn-fix' into 'master'
+ Avoid warnings about casts between HANDLE and int in Win64 builds
+
+ Closes #2
+
+ See merge request libtiff/libtiff!93
+
+2020-02-26 Even Rouault
+
+ Merge branch 'bug2839' into 'master'
+ raw2tiff: avoid divide by 0
+
+ Closes #151
+
+ See merge request libtiff/libtiff!103
+
+2020-02-26 Even Rouault
+
+ Merge branch 'bug2669' into 'master'
+ tiff2pdf: palette bound check in t2p_sample_realize_palette()
+
+ Closes #82
+
+ See merge request libtiff/libtiff!104
+
+2020-02-26 Even Rouault
+
+ Merge branch 'int-shift' into 'master'
+ tiffcrop: fix asan runtime error caused by integer promotion
+
+ See merge request libtiff/libtiff!105
+
+2020-02-26 Even Rouault
+
+ Merge branch 'bug-2538' into 'master'
+ libtiff.html: fix function casing
+
+ Closes #68
+
+ See merge request libtiff/libtiff!107
+
+2020-02-16 Thomas Bernard
+
+ raw2tiff: avoid divide by 0.
+ fixes #151 / http://bugzilla.maptools.org/show_bug.cgi?id=2839
+
+ first memcmp() lines before computing correlation
+ and always avoid divide by 0 anyway
+
+2020-02-09 Even Rouault
+
+ Merge branch 'bug2855' into 'master'
+ tiff2ps: fix heap buffer read overflow in PSDataColorContig()
+
+ Closes #161
+
+ See merge request libtiff/libtiff!102
+
+2020-02-08 Thomas Bernard
+
+ libtiff.html: fix function casing.
+
+ libtiff.html: fix function casing.
+ fixes #68 / http://bugzilla.maptools.org/show_bug.cgi?id=2538
+
+2020-02-08 Thomas Bernard
+
+ tiffcrop: fix asan runtime error caused by integer promotion.
+ tiffcrop.c:4027:20: runtime error: left shift of 190 by 24 places cannot be represented in type 'int'
+
+ C treats (byte << 24) as an int expression.
+ casting explicitly to unsigned type uint32 avoids the problem.
+
+ the same issue has been fixed elsewhere with a24213691616e7cd35aa3e2805493de80c7e4fcf
+
+ I detected the bug with the test file of #86
+
+2020-02-08 Thomas Bernard
+
+ tiff2pdf: palette bound check in t2p_sample_realize_palette()
+ fixes #82
+
+2020-02-08 Thomas Bernard
+
+ tiff2ps: fix heap buffer read overflow in PSDataColorContig()
+ fixes #161 / http://bugzilla.maptools.org/show_bug.cgi?id=2855
+
+ in 05029fb7f1ecf771abaf90b5705b6cab9eb522a7 I missed that 1 extra byte is read
+ in this loop.
+
+2020-02-05 Even Rouault
+
+ tif_dirread.c: suppress CLang static Analyzer 9.0 false positive.
+
+2020-02-01 Even Rouault
+
+ TIFFSetupStrips: enforce 2GB limitation of Strip/Tile Offsets/ByteCounts arrays
+ TIFFWriteDirectoryTagData() has an assertion that checks that the
+ arrays are not larger than 2GB. So error out earlier if in that situation.
+
+2020-01-29 Bob Friesenhahn
+
+ Simplify nmake configuration for building port directory. Now there is only one boolean setting to enable building strtoll() and strtoull() port functions.
+ The boolean setting enables the necessary port files to be built, but the remainder of the logic is via pre-processor code in the common tif_config.h, which was prepared before entering the port directory to do a build.
+
+2020-01-28 Bob Friesenhahn
+
+ Make sure that tif_config.h is produced prior to entering the port directory and add an include path so that the port files can include tif_config.h. Do not actually include tif_config.h at this time since CMake and Autotools builds are not prepared for that. This issue could be handled by updating the CMake and Autotools builds or by adding a define which directs libport.h to include tif_config.h.
+
+2020-01-26 Bob Friesenhahn
+
+ Fix nmake build mistakes in my last commit:
+ tif_config.vc.h:
+
+ Always define HAVE_STRTOL/HAVE_STRTOUL.
+ Define HAVE_STRTOLL/HAVE_STRTOULL if _MSC_VER >= 1900.
+
+ nmake.opt:
+
+ Provide defaults suitable for MSVC prior to 14.0.
+
+ libport.h:
+
+ The sense of the pre-processor logic was inverted from what it
+ should be. The intention is to only provide the prototype if the
+ function is missing.
+
+2020-01-25 Bob Friesenhahn
+
+ Add nmake build support for manually configuring the 'port' files to be built based on MSVC features. Include tif_config.h in tools/tiffset.c.
+
+2020-01-23 Even Rouault
+
+ Adjust previous fix to avoid undue warning in some situations triggered by GDAL
+
+2020-01-12 Even Rouault
+
+ _TIFFPartialReadStripArray: bring back support for non-conformant SLONG8 data type
+ Such as in https://github.com/OSGeo/gdal/issues/2165
+
+2020-01-07 Even Rouault
+
+ test: add test for single-strip OJPEG file without RowsPerStrip tag (like in CR2 files)
+
+ OJPEGReadHeaderInfo: if rowsperstrip not defined, then assume one-single-strip. Complementary fix to 0356ea76bac908c61160d735f078437ace953bd3
+
+2019-12-16 Angel Sánchez
+
+ fix issue #78 warnings regarding RichTIFFIPTC data type.
+
+2019-12-14 Even Rouault
+
+ contrib/oss-fuzz/build.sh: fix broken if construct.
+
+2019-11-28 Even Rouault
+
+ contrib/oss-fuzz/build.sh: other attempt at fixing build failure.
+
+2019-11-20 Even Rouault
+
+ contrib/oss-fuzz/build.sh: install liblzma-dev for x86_64 builds.
+
+2019-11-17 Even Rouault
+
+ contrib/oss-fuzz/build.sh: install liblzma-dev:i386 on i386 builds.
+
+2019-11-15 Even Rouault
+
+ Merge branch 'cmake-parse' into 'master'
+ CMake: simplify parsing variables from configure
+
+ See merge request libtiff/libtiff!98
+
+2019-11-15 Rolf Eike Beer
+
+ CMake: simplify parsing variables from configure.
+
+2019-11-14 Even Rouault
+
+ contrib/oss-fuzz/build.sh: fix ossfuzz build by statically linking to lzma
+
+2019-11-12 Even Rouault
+
+ Merge branch 'fix_ojpeg_172' into 'master'
+ OJPEG: fix broken sanity check added in 4.1.0 (#fixes 172)
+
+ See merge request libtiff/libtiff!97
+
+2019-11-11 Even Rouault
+
+ OJPEG: fix broken sanity check added in 4.1.0, and add two OJPEG test files
+
+ test/: add missing generated .sh files.
+
+2019-11-04 Even Rouault
+
+ Merge branch 'fix-missing-checks-TIFFGetField-tiffcrop' into 'master'
+ adds missing checks on TIFFGetField in tiffcrop tool
+
+ Closes #170
+
+ See merge request libtiff/libtiff!96
+
+2019-11-04 Bug Checkers
+
+ adds missing checks on TIFFGetField in tiffcrop tool (fixes #170)
+
+2019-11-04 Even Rouault
+
+ Merge branch 'adds-missing-TIFFClose-rgb2ycbcr' into 'master'
+ adds a missing TIFFClose in rgb2ycbcr tool
+
+ See merge request libtiff/libtiff!95
+
+2019-11-04 Mansour Ahmadi
+
+ adds a missing TIFFClose in rgb2ycbcr tool.
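Two of the fixes above ("adds missing checks on TIFFGetField in tiffcrop" and "adds a missing TIFFClose in rgb2ycbcr") reflect the same client-side hygiene rules: check TIFFGetField()'s return value before trusting its outputs, and close every opened TIFF. A condensed sketch of the pattern (illustrative, not the tools' actual code):

```c
#include "tiffio.h"

int read_dimensions(const char *path, uint32 *w, uint32 *h)
{
    TIFF *tif = TIFFOpen(path, "r");
    if (!tif)
        return 0;
    /* A failed TIFFGetField() leaves the output variables untouched,
     * so the return value must be checked before using them. */
    int ok = TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, w) &&
             TIFFGetField(tif, TIFFTAG_IMAGELENGTH, h);
    TIFFClose(tif); /* the call the rgb2ycbcr fix added */
    return ok;
}
```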
+
+2019-11-03 Bob Friesenhahn
+
+ libtiff 4.1.0 released.
+
+ Added a step for updating the legacy ChangeLog file.
+
+ Ignore emacs temporary files (ending with tilde character).
+
+ Added release summary page for the 4.1.0 release.
+
+ Fix CMake HAVE_GETOPT for systems which declare getopt in stdio.h. Fix utility baked-in getopt prototype which appears when HAVE_GETOPT is not defined.
+
+ Fax2tiff.sh needs to remove its output file in advance. Syntax changes so that bash is not required.
+
+2019-10-26 Even Rouault
+
+ tif_jpeg.c: extra cast to silence Coverity warning. GDAL CID 1406475.
+
+2019-10-23 Even Rouault
+
+ tif_jpeg.c: fix warning added by previous commit (on 32bit builds)
+
+2019-10-23 Even Rouault
+
+ Merge branch 'coverity-fixes' into 'master'
+ Coverity fixes
+
+ See merge request libtiff/libtiff!94
+
+2019-10-22 Timothy Lyanguzov
+
+ Use 64-bit calculations correctly.
+
+ Fix size calculation to use 64-bit tmsize_t correctly.
+
+ Make bytesperclumpline calculations using tmsize_t type.
+
+2019-10-03 Even Rouault
+
+ tif_read: align code of TIFFReadRawStrip() and TIFFReadRawTile() that differed for no good reason. Non-functional change normally. (fixes GitLab #162)
+
+2019-10-01 Even Rouault
+
+ HTML: update for GitLab issues.
+
+2019-09-29 Even Rouault
+
+ html/v3.5.6-beta.html: redact URL of defunct web site.
+
+ Website: update links to mailing list.
+
+2019-09-17 Even Rouault
+
+ TIFFReadAndRealloc(): avoid too large memory allocation attempts. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=17244
+
+2019-09-03 Even Rouault
+
+ ByteCountLooksBad and EstimateStripByteCounts: avoid unsigned integer overflows. Fixes https://oss-fuzz.com/testcase-detail/5686156066291712 and https://oss-fuzz.com/testcase-detail/6332499206078464
+
+2019-09-02 Even Rouault
+
+ tif_ojpeg.c: avoid relying on isTiled macro being wrapped in ()
+
+ tif_ojpeg.c: avoid use of uninitialized memory on edge/broken file. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16844
+
+ tiff_read_rgba_fuzzer.cc: add a -DSTANDALONE mode for easier reproduction of oss-fuzz reports
+
+2019-09-01 Even Rouault
+
+ tif_dirread.c: allocChoppedUpStripArrays(). avoid unsigned integer overflow. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16846
+
+2019-08-27 Even Rouault
+
+ tif_ojpeg.c: avoid unsigned integer overflow. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16793
+
+2019-08-26 Even Rouault
+
+ TIFFReadDirEntryData(): rewrite to avoid unsigned integer overflow (not a bug). Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16792
+
+ TIFFFetchDirectory(): fix invalid cast from uint64 to tmsize_t. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16784
+
+2019-08-25 Even Rouault
+
+ JPEG: avoid use of uninitialized memory on corrupted files.
+ Follow-up of cf3ce6fab894414a336546f62adc57f02590a22c
+ Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16602
+ Credit to OSS Fuzz
+
+2019-08-23 Even Rouault
+
+ _TIFFPartialReadStripArray(): avoid unsigned integer overflow. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16685
+
+ OJPEGWriteHeaderInfo(): avoid unsigned integer overflow on strile dimensions close to UINT32_MAX. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16683
+
+ TIFFFillStrip(): avoid harmless unsigned integer overflow. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16653
+
+ EstimateStripByteCounts(): avoid unsigned integer overflow.
+ Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16643
+
+ tif_ojpeg: avoid unsigned integer overflow (probably not a bug). Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16635
+
+ tif_thunder: avoid unsigned integer overflow (not a bug). Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16632
+
+2019-08-22 Even Rouault
+
+ _TIFFMultiply32() / _TIFFMultiply64(): avoid relying on unsigned integer overflow (not a bug)
+
+ EstimateStripByteCounts(): avoid unsigned integer overflow.
+
+2019-08-21 Even Rouault
+
+ EstimateStripByteCounts(): avoid unsigned integer overflow.
+
+2019-08-20 Even Rouault
+
+ EstimateStripByteCounts(): avoid harmless unsigned integer overflow.
+
+ _TIFFPartialReadStripArray(): avoid triggering unsigned integer overflow with -fsanitize=unsigned-integer-overflow (not a bug, this is well defined by itself)
+
+2019-08-18 Even Rouault
+
+ tiff2ps: fix use of wrong data type that caused issues (/Height being written as 0) on 64-bit big endian platforms
+
+2019-08-16 Even Rouault
+
+ setByteArray(): fix previous commit.
+
+ setByteArray(): avoid potential signed integer overflow. Pointed by Hendra Gunadi. No actual problem known (which does not mean there wouldn't be any. Particularly on 32bit builds)
+
+2019-08-15 Even Rouault
+
+ RGBA interface: fix integer overflow potentially causing write heap buffer overflow, especially on 32 bit builds. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16443. Credit to OSS Fuzz
+
+2019-08-14 Even Rouault
+
+ Merge branch 'fix_integer_overflow' into 'master'
+ Fix integer overflow in _TIFFCheckMalloc() and other implementation-defined behaviour (CVE-2019-14973)
+
+ See merge request libtiff/libtiff!90
+
+2019-08-13 Even Rouault
+
+ Fix integer overflow in _TIFFCheckMalloc() and other implementation-defined behaviour (CVE-2019-14973)
+ _TIFFCheckMalloc()/_TIFFCheckRealloc() used an unsafe way to detect overflow
+ in the multiplication of nmemb and elem_size (which are of type tmsize_t, thus
+ signed), which was especially easily triggered on 32-bit builds (with recent
+ enough compilers that assume that signed multiplication cannot overflow, since
+ this is undefined behaviour by the C standard). The original issue which led to
+ this fix was triggered from tif_fax3.c
+
+ There were also unsafe (implementation-defined), and broken in practice on 64bit
+ builds, ways of checking that a uint64 fits in a (signed) tmsize_t by doing
+ (uint64)(tmsize_t)uint64_var != uint64_var comparisons. Those had no known
+ exploits at that time, but are better to fix in a more bullet-proof way.
+ Or similarly use of (int64)uint64_var <= 0.
+
+2019-08-12 Even Rouault
+
+ TIFFClientOpen(): fix memory leak if one of the required callbacks is not provided. Fixed Coverity GDAL CID 1404110
+
+ OJPEGReadBufferFill(): avoid very long processing time on corrupted files. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16400. master only
+
+2019-08-10 Even Rouault
+
+ oss-fuzz/tiff_read_rgba_fuzzer.cc: fix wrong env variable value in previous commit
+
+ oss-fuzz/tiff_read_rgba_fuzzer.cc: avoid issue with libjpeg-turbo and MSAN
+
+ OJPEG: fix integer division by zero on corrupted subsampling factors. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15824.
+ Credit to OSS Fuzz
+
+ Merge branch 'ossfuzz_i386'
+
+ contrib/oss-fuzz/build.sh: fix for i386 build of jbigkit, and use $LIB_FUZZING_ENGINE
+
+2019-08-10 Even Rouault
+
+ Merge branch 'patch-1' into 'master'
+ fix two tiny typos
+
+ See merge request libtiff/libtiff!89
+
+2019-08-10 Reto Kromer
+
+ fix two tiny typos.
+
+2019-08-09 Even Rouault
+
+ Merge branch 'patch-1' into 'master'
+ fix a typo in man page
+
+ See merge request libtiff/libtiff!88
+
+2019-08-09 Reto Kromer
+
+ fix typo.
+
+2019-08-04 Even Rouault
+
+ Merge branch 'TIFFTAGID_Zero_reading_IGNORE' into 'master'
+ Suppressed Reading of Tiff tags with ID = 0 (like GPSVERSIONID) corrected.
+
+ See merge request libtiff/libtiff!77
+
+2019-08-04 Su Laus
+
+ Reading of Tiff tags with ID = 0 (like GPSVERSIONID) corrected.
+ IGNORE placeholder in tif_dirread.c is now replaced by a field dir_ignore in the TIFFDirEntry structure
+
+ Currently, in tif_dirread.c a special IGNORE value for the tif tags is defined
+ in order to flag status preventing already processed tags from further processing.
+ This irrational behaviour prevents reading of custom tags with id code 0 - like tag GPSVERSIONID from EXIF 2.31 definition.
+
+ An additional field 'tdir_ignore' is now added to the TIFFDirEntry structure and code is changed
+ to allow tags with id code 0 to be read correctly.
+
+ This change was already proposed as pending improvement in tif_dirread.c around line 32.
+
+ Reference is also made to:
+ - Discussion in https://gitlab.com/libtiff/libtiff/merge_requests/39
+ - http://bugzilla.maptools.org/show_bug.cgi?id=2540
+
+ Comments and indentation adapted.
+
+ Preparation to rebase onto master
+
+2019-07-16 Even Rouault
+
+ Merge branch 'cmake_amd64' into 'master'
+ CMakeLists.txt: properly set value of HOST_FILLORDER to LSB2MSB for Windows CMake builds
+
+ See merge request libtiff/libtiff!87
+
+2019-07-15 Even Rouault
+
+ CMakeLists.txt: properly set value of HOST_FILLORDER to LSB2MSB for Windows CMake builds
+ As can be seen in https://ci.appveyor.com/project/rleigh-codelibre/libtiff-didfs/builds/25846668/job/ory5w098j8wcij9x
+ log, the HOST_FILLORDER is not properly set:
+
+ [00:02:58] -- CMAKE_HOST_SYSTEM_PROCESSOR set to AMD64
+ [00:02:58] -- HOST_FILLORDER set to FILLORDER_MSB2LSB
+
+ The reason is that we match the "amd64.*" lowercase string whereas
+ CMAKE_HOST_SYSTEM_PROCESSOR is set to AMD64 uppercase.
+
+2019-07-09 Even Rouault
+
+ TIFFWriteCheck(): call TIFFForceStrileArrayWriting() when needed (should have gone with eaeca6274ae71cdfaeb9f673b6fb0f3cfc0e6ce5) (master only)
+
+2019-07-09 Even Rouault
+
+ Merge branch 'fix_chromium_925269' into 'master'
+ OJPEG: avoid use of uninitialized memory on corrupted files
+
+ See merge request libtiff/libtiff!86
+
+2019-07-05 Even Rouault
+
+ OJPEG: avoid use of uninitialized memory on corrupted files.
+ Fixes https://bugs.chromium.org/p/chromium/issues/detail?id=925269
+ Patch from Lei Zhang with little adaptations.
+
+2019-06-29 Even Rouault
+
+ Merge branch 'fix-division-by-zero' into 'master'
+ Return infinite distance when denominator is zero.
+
+ See merge request libtiff/libtiff!85
+
+2019-06-29 Dirk Lemstra
+
+ Return infinite distance when denominator is zero.
+
+2019-06-29 Even Rouault
+
+ Merge branch 'typetests' into 'master'
+ Add test to check that libtiff types have the correct size
+
+ See merge request libtiff/libtiff!57
+
+2019-05-31 Thomas Bernard
+
+ make TIFF_SSIZE_T the same bitwidth as TIFF_SIZE_T.
+ it was previously the same bitwidth as unsigned char *
+ Pointers can be larger than size_t.
+
+2019-05-31 Thomas Bernard
+
+ Add test to check that libtiff types have the correct size.
+ in configure/CMakeLists.txt:
+
+ - TIFF_INT8_T/TIFF_UINT8_T is signed/unsigned char
+ sizeof(char)==1 in C standard
+ - TIFF_INT16_T/TIFF_UINT16_T is signed/unsigned short
+ sizeof(short)>=2 in C standard
+ - TIFF_INT32_T/TIFF_UINT32_T is defined so its sizeof() is 4
+
+ - TIFF_INT64_T/TIFF_UINT64_T is defined so its sizeof() is 8
+
+ - TIFF_SIZE_T is defined so it has the same sizeof() as size_t
+
+ - TIFF_SSIZE_T is defined so it has the same sizeof() as unsigned char *
+
+2019-05-29 Even Rouault
+
+ Merge branch 'defer_strile_writing' into 'master'
+ Add TIFFDeferStrileArrayWriting() and TIFFForceStrileArrayWriting()
+
+ See merge request libtiff/libtiff!82
+
+2019-05-29 Even Rouault
+
+ Merge branch 'TIFFReadFromUserBuffer' into 'master'
+ Add TIFFReadFromUserBuffer()
+
+ See merge request libtiff/libtiff!81
+
+2019-05-26 Even Rouault
+
+ Fix vulnerability in 'D' (DeferStrileLoad) mode (master only) (fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=14908)
+
+2019-05-25 Even Rouault
+
+ Replace 'stripped' by 'striped' in error messages.
+
+2019-05-25 Even Rouault
+
+ Add TIFFDeferStrileArrayWriting() and TIFFForceStrileArrayWriting()
+ Those advanced writing functions must be used in a particular sequence
+ to have their intended effect. Their aim is to control when/where
+ the [Strip/Tile][Offsets/ByteCounts] arrays are written into the file.
+
+ The purpose of this is to generate 'cloud-optimized geotiff' files where
+ the first KB of the file only contains the IFD entries without the potentially
+ large strile arrays. Those are written afterwards.
+
+ The typical sequence of calls is:
+ TIFFOpen()
+ [ TIFFCreateDirectory(tif) ]
+ Set fields with calls to TIFFSetField(tif, ...)
+ TIFFDeferStrileArrayWriting(tif)
+ TIFFWriteCheck(tif, ...)
+ TIFFWriteDirectory(tif)
+ ... potentially create other directories and come back to the above directory
+ TIFFForceStrileArrayWriting(tif): emit the arrays at the end of file
+
+ See test/defer_strile_writing.c for a practical example.
+
+2019-05-24 Even Rouault
+
+ Fix vulnerability introduced by defer strile loading (master only)
+ Found on GDAL with https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=14894
+ Disabling the TIFF_DEFERSTRILELOAD bit in ChopupStripArray() was a
+ bad idea since using TIFFReadDirectory() to reload the directory again
+ would lead to a different value of td_rowsperstrip, which could confuse
+ readers if they relied on the value found initially.
+
+ Fix typo in error message (master only)
+
+2019-05-22 Even Rouault
+
+ Add TIFFReadFromUserBuffer()
+ This function replaces the use of TIFFReadEncodedStrip()/TIFFReadEncodedTile()
+ when the user can provide the buffer for the input data, for example when
+ they want to avoid having libtiff read the strile offset/count values from the
+ [Strip|Tile][Offsets/ByteCounts] array.
+
+ libtiff.def: add missing new symbols.
+
+ test/defer_strile_loading.c: fix warning with Visual C++
+
+ _TIFFRewriteField(): fix for bigtiff case (master only)
+ 116cf67f4c59196605abdb244657c3070c4310af made StripByteCount/TileByteCount to
+ always be rewritten as TIFF_LONG8.
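The 2019-05-25 TIFFDeferStrileArrayWriting() entry above spells out a required call sequence; rendered as code for a single directory (a sketch of that sequence only, error checking and the actual strip writes omitted):

```c
#include "tiffio.h"

void write_cloud_optimized(const char *path)
{
    TIFF *tif = TIFFOpen(path, "w");
    if (!tif)
        return;
    /* ... TIFFSetField(tif, ...) calls for this directory ... */
    TIFFDeferStrileArrayWriting(tif);
    TIFFWriteCheck(tif, 0, "write_cloud_optimized"); /* 0 = strip-based */
    TIFFWriteDirectory(tif);
    /* ... potentially create other directories and come back
     *     to the above directory ... */
    TIFFForceStrileArrayWriting(tif); /* emits the arrays at end of file */
    TIFFClose(tif);
}
```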
+ +2019-05-21 Even Rouault + + Merge branch 'ondemand_strile_offbytecount_loading' into 'master' + Make defer strile offset/bytecount loading available at runtime + + See merge request libtiff/libtiff!79 + +2019-05-21 Even Rouault + + Merge branch 'bigtiff_write_bytecount_on_long_when_possible' into 'master' + Create TileByteCounts/StripByteCounts tag with SHORT (ClassicTIFF/BigTIFF) or LONG (BigTIFF) type when possible + + See merge request libtiff/libtiff!78 + +2019-05-21 Even Rouault + + Merge branch 'html_link' into 'master' + libtiff.html, bigtiffpr.html: absolute => relative link + + See merge request libtiff/libtiff!80 + +2019-05-14 Thomas Bernard + + libtiff.html, bigtiffpr.html: absolute => relative link. + +2019-05-10 Even Rouault + + Make defer strile offset/bytecount loading available at runtime. + ... and add per-strile offset/bytecount loading capabilities. + + Part of this commit makes the behaviour that was previously met when + libtiff was compiled with -DDEFER_STRILE_LOAD available for default builds + when specifying the new 'D' (Deferred) TIFFOpen() flag. In that mode, the [Tile/Strip][ByteCounts/Offsets] + arrays are only loaded when first accessed. This can speed-up the opening + of files stored on the network when just metadata retrieval is needed. + This mode has been used for years by the GDAL library when compiled with + its embeded libtiff copy. + + To avoid potential out-of-tree code (typically codecs) that would use + the td_stripbytecount and td_stripoffset array inconditionnaly assuming they + have been loaded, those have been suffixed with _p (for protected). The + use of the new functions mentionned below is then recommended. + + Another addition of this commit is the capability of loading only the + values of the offset/bytecount of the strile of interest instead of the + whole array. This is enabled with the new 'O' (Ondemand) flag of TIFFOpen() + (which implies 'D'). That behaviour has also been used by GDAL, which hacked + into the td_stripoffset/td_stripbytecount arrays directly. The new code + added in the _TIFFFetchStrileValue() and _TIFFPartialReadStripArray() internal + functions is mostly a port of what was in GDAL GTiff driver previously. + + Related to that, the public TIFFGetStrileOffset[WithErr]() and TIFFGetStrileByteCount[WithErr]() + functions have been added to API. They are of particular interest when + using sparse files (with offset == bytecount == 0) and you want to detect + if a strile is present or not without decompressing the data, or updating + an existing sparse file. + They will also be used to enable a future enhancement where client code can entirely + skip bytecount loading in some situtations + + A new test/defer_strile_loading.c test has been added to test the above + capabilities. + +2019-05-10 Even Rouault + + Creation: use SHORT type when possible for StripByteCounts/TileByteCounts + This follows the same logic as previous commit. + +2019-05-09 Even Rouault + + BigTIFF creation: write TileByteCounts/StripByteCounts tag with LONG when possible + In most situations of BigTIFF file, the tile/strip sizes are of reasonable size, + that is they fit on a 4-byte LONG. So in that case, use LONG instead of LONG8 + to save some space. For uncompressed file, it is easy to detect such situations + by checking at the TIFFTileSize64()/TIFFStripSize64() return. For compressed file, + we must take into account the fact that compression may sometimes result in + larger compressed data. 
So we allow this optimization only for a few select
+ compression types, and take a huge security margin (10x factor). We also only
+ apply this optimization to multi-strip files, so as to allow easy on-the-fly
+ growing of single-strip files whose strip size could grow above the 4GB threshold.
+
+ This change is compatible with the BigTIFF specification. According to
+ https://www.awaresystems.be/imaging/tiff/bigtiff.html:
+ "The StripOffsets, StripByteCounts, TileOffsets, and TileByteCounts tags are
+ allowed to have the datatype TIFF_LONG8 in BigTIFF. Old datatypes TIFF_LONG,
+ and TIFF_SHORT where allowed in the TIFF 6.0 specification, are still valid in BigTIFF, too."
+ From a practical point of view, this is also compatible with reading/writing by
+ older libtiff 4.X versions.
+
+ The only glitch I found, which is rather minor, is when using such a BigTIFF
+ file with TileByteCounts/StripByteCounts written with TIFF_LONG, and updating
+ it with an older libtiff 4.X version with a change in the
+ [Tile/Strip][ByteCounts/Offsets] array. In that case the _TIFFRewriteField()
+ function will rewrite the directory and array with TIFF_LONG8, instead of updating
+ the existing array (this is an issue fixed by this commit). The file will
+ still be valid, however, hence the minor severity of this.
+
+2019-05-08 Even Rouault
+
+ Merge branch 'bug2799' into 'master'
+ fix fax2tiff
+
+ See merge request libtiff/libtiff!55
+
+2019-05-08 Even Rouault
+
+ Merge branch 'bug_2829' into 'master'
+ WIN32: use tif_win32.c when building with CMake
+
+ See merge request libtiff/libtiff!75
+
+2019-05-06 Even Rouault
+
+ Merge branch 'FILESOURCE_SCENETYPE_reading' into 'master'
+ Reading error for FileSource and SceneType tags corrected.
+
+ See merge request libtiff/libtiff!76
+
+2019-05-06 Su Laus
+
+ Reading error for FileSource and SceneType tags corrected.
+ EXIF tags FILESOURCE and SCENETYPE are defined as TIFF_UNDEFINED with field_readcount==1!
+ There is a bug in TIFFReadDirEntryByte() that prevents TIFF_UNDEFINED fields with field_readcount==1 from being read correctly.
+ Upgrading TIFFReadDirEntryByte() with an added TIFF_UNDEFINED switch entry allows libtiff to read those tags correctly.
+
+2019-04-25 Thomas Bernard
+
+ WIN32: use tif_win32.c when building with CMake.
+ see http://bugzilla.maptools.org/show_bug.cgi?id=2829
+
+ the top CMakeLists.txt defines
+ win32_io and USE_WIN32_FILEIO
+
+ WIN32_IO is defined nowhere in CMake (only in the automake build)
+
+2019-04-25 Even Rouault
+
+ Merge branch 'gitlab_pages' into 'master'
+ Advertise https://libtiff.gitlab.io/libtiff/ as mirror
+
+ See merge request libtiff/libtiff!70
+
+2019-04-25 Even Rouault
+
+ Merge branch 'bug_2844' into 'master'
+ tiff2ps.c: PSDataColorContig(): avoid heap buffer overrun
+
+ See merge request libtiff/libtiff!69
+
+2019-04-25 Even Rouault
+
+ Merge branch 'issue_2785' into 'master'
+ tiff2pdf.c: don't call t2p_tile_collapse_left() for Ycbcr
+
+ See merge request libtiff/libtiff!64
+
+2019-04-11 Even Rouault
+
+ Merge branch 'fix_gdal_1439' into 'master'
+ TIFFWriteEncodedStrip/TIFFWriteEncodedTile: fix rewriting of LZW-compressed data
+
+ See merge request libtiff/libtiff!74
+
+2019-04-11 Even Rouault
+
+ TIFFWriteEncodedStrip/TIFFWriteEncodedTile: fix rewriting of LZW-compressed data
+ Fixes https://github.com/OSGeo/gdal/issues/1439
+
+ When rewriting an LZW-compressed tile/strip whose existing size is very close to a multiple of
+ 1024 bytes (and larger than 8192 bytes) with compressed data that is larger,
+ the new data was not placed at the end of the file, causing corruption.
+
+2019-04-08 Even Rouault
+
+ Merge branch 'bug2848' into 'master'
+ tif_luv.c: LogLuvSetupEncode() error must return 0
+
+ See merge request libtiff/libtiff!72
+
+2019-04-03 Thomas Bernard
+
+ build/gitlab-ci: fix typo.
+
+ Show test-suite.log in gitlab-ci.
+ Useful when the build fails.
+
+ Add output check for tiff2ps.
+ Note: the reference files have been generated on the master branch.
+
+2019-03-23 Even Rouault
+
+ tif_read.c: potentially fix false positive from Coverity Scan. CID 1400288
+
+ tif_read.c: potentially fix false positive from Coverity Scan. CID 1400271
+
+ tif_zip.c: remove dead code. CID 1400360.
+
+ tif_webp.c: remove false positive warning about dereference before null check. CID 1400255
+
+ tif_pixarlog.c: remove dead code. CID 1400342.
+
+ tif_pixarlog.c: avoid false positive Coverity Scan warnings about overflow. CID 1400300 and 1400367
+
+ tif_lzw.c: silence CoverityScan false positive. CID 1400355.
+
+ tif_luv.c: silence CoverityScan false positive. CID 1400231, 1400251, 1400254, 1400272, 1400318, 1400356
+
+ TryChopUpUncompressedBigTiff(): avoid potential division by zero. master only. GDAL Coverity CID 1400263
+
+2019-03-22 Thomas Bernard
+
+ tif_luv.c: LogLuvSetupEncode() error must return 0.
+ see http://bugzilla.maptools.org/show_bug.cgi?id=2848
+
+ If it wrongly returns 1, the processing of the incorrect file continues,
+ which causes problems.
+
+2019-03-22 Thomas Bernard
+
+ add a test for the fax2tiff tool.
+
+2019-02-28 Thomas Bernard
+
+ tiff2pdf.c: don't call t2p_tile_collapse_left() when the buffer size is wrong
+ see http://bugzilla.maptools.org/show_bug.cgi?id=2785
+
+ Advertise https://libtiff.gitlab.io/libtiff/ as mirror.
+ I put it above the maptools.org mirror because
+ Even Rouault believes it will at some point be completely removed.
+
+2019-02-28 Even Rouault
+
+ Merge branch 'bug_2826' into 'master'
+ tiff2pdf.c: check colormap pointers when loading CMYK with colormap
+
+ See merge request libtiff/libtiff!65
+
+2019-02-28 Thomas Bernard
+
+ tiff2pdf.c: check colormap pointers.
+ Avoid access to uninitialized pointers
+ http://bugzilla.maptools.org/show_bug.cgi?id=2826
+
+2019-02-27 Even Rouault
+
+ Merge branch 'fix_warnings' into 'master'
+ tiff2ps.c: fix warning caused by integer promotion
+
+ See merge request libtiff/libtiff!68
+
+2019-02-23 Thomas Bernard
+
+ PSDataColorContig(): avoid heap buffer overrun.
+ fixes http://bugzilla.maptools.org/show_bug.cgi?id=2844
+ each iteration of the loop reads nc bytes
+
+2019-02-22 Thomas Bernard
+
+ tiff2ps.c: fix warning caused by integer promotion.
+ A uint8 value is promoted to int in (value << 24), so -fsanitize
+ yields runtime errors:
+ tiff2ps.c:2969:33: runtime error: left shift of 246 by 24 places cannot be represented in type 'int'
+
+2019-02-22 Even Rouault
+
+ Merge branch 'large_strile_improvements' into 'master'
+ Large strile support improvements
+
+ See merge request libtiff/libtiff!63
+
+2019-02-21 Even Rouault
+
+ Merge branch 'gitlab-pages' into 'master'
+ ci: Add pages job
+
+ See merge request libtiff/libtiff!45
+
+2019-02-19 Even Rouault
+
+ Merge branch 'issue_2833' into 'master'
+ tiffcp.c: check that (Tile Width)*(Samples/Pixel) does not overflow
+
+ See merge request libtiff/libtiff!60
+
+2019-02-19 Even Rouault
+
+ Merge branch 'issue_2831' into 'master'
+ tiffcrop.c: fix invertImage() for bps 2 and 4
+
+ See merge request libtiff/libtiff!61
+
+2019-02-19 Even Rouault
+
+ Merge branch 'issue_2842' into 'master'
+ move _TIFFClampDoubleToFloat() to tif_aux.c
+
+ See merge request libtiff/libtiff!62
+
+2019-02-19 Even Rouault
+
+ tif_zip.c: allow reading and writing strips/tiles with more than 4 GB of compressed or uncompressed data
+
+ tif_dirread.c: when strip chopping is enabled, extend this mechanism to multi-strip uncompressed files with strips larger than 2GB to expose them as strips of ~500 MB
+
+2019-02-19 Even Rouault
+
+ Merge branch 'size_t_typo' into 'master'
+ CMakeLists.txt: fix TIFF_SIZE_T
+
+ See merge request libtiff/libtiff!59
+
+2019-02-12 Thomas Bernard
+
+ move _TIFFClampDoubleToFloat() to tif_aux.c.
+ the same function was declared in tif_dir.c and tif_dirwrite.c
+
+ see http://bugzilla.maptools.org/show_bug.cgi?id=2842
+
+2019-02-11 Thomas Bernard
+
+ tiffcrop.c: fix invertImage() for bps 2 and 4.
+ Too many bytes were processed, causing a heap buffer overrun
+ http://bugzilla.maptools.org/show_bug.cgi?id=2831
+ the loop counter must be
+ for (col = 0; col < width; col += 8 / bps)
+
+ Also the values were not properly calculated. It should be
+ 255-x, 15-x, 3-x for bps 8, 4, 2.
+
+ But anyway it is easier to invert all bits, as 255-x = ~x, etc.
+ (subtracting from a binary number composed of all 1s is the same as inverting
+ the bits)
+
+2019-02-11 Thomas Bernard
+
+ tiffcp.c: use INT_MAX.
+
+ check that (Tile Width)*(Samples/Pixel) does not overflow.
+ fixes bug 2833
+
+2019-02-03 Thomas Bernard
+
+ CMakeLists.txt: fix TIFF_SIZE_T.
+
+2019-02-02 Even Rouault
+
+ Merge branch 'master' into 'master'
+ Fix for simple memory leak that was assigned CVE-2019-6128.
+
+ See merge request libtiff/libtiff!50
+
+2019-02-02 Even Rouault
+
+ Merge branch 'bug2835' into 'master'
+ tiff2ps: fix heap-buffer-overflow
+
+ See merge request libtiff/libtiff!53
+
+2019-02-02 Even Rouault
+
+ Fix warning (use of uninitialized value) added per d0a842c5dbad2609aed43c701a12ed12461d3405 (fixes https://gitlab.com/libtiff/libtiff/merge_requests/54#note_137742985)
+
+2019-02-02 Yuri Aksenov
+
+ fix fax2tiff.
+ see http://bugzilla.maptools.org/show_bug.cgi?id=2799
+ fixes d9bc8472e72549f29c0062c1cbd3d56f279f3be2
+
+2019-02-02 Even Rouault
+
+ Merge branch 'tiffcrop' into 'master'
+ tiffcrop: shut up clang warnings
+
+ See merge request libtiff/libtiff!52
+
+2019-02-01 Even Rouault
+
+ Merge branch 'bug2833' into 'master'
+ TIFFWriteDirectoryTagTransferfunction() : fix NULL dereferencing
+
+ See merge request libtiff/libtiff!54
+
+2019-02-01 Even Rouault
+
+ Merge branch 'gitignore' into 'master'
+ add test/ files to .gitignore
+
+ See merge request libtiff/libtiff!56
+
+2019-02-01 Even Rouault
+
+ Merge branch 'master' into 'master'
+ tif_dir: unset transferfunction field if necessary (CVE-2018-19210)
+
+ See merge request libtiff/libtiff!47
+
+2019-01-29 Thomas Bernard
+
+ add test/ files to .gitignore.
+
+2019-01-29 Thomas Bernard
+
+ TIFFWriteDirectoryTagTransferfunction() : fix NULL dereferencing.
+ http://bugzilla.maptools.org/show_bug.cgi?id=2833
+
+ We must check that the pointer is not NULL before calling memcmp() on the memory.
+
+2019-01-29 Thomas Bernard
+
+ tiff2ps: fix heap-buffer-overflow.
+ http://bugzilla.maptools.org/show_bug.cgi?id=2834
+
+ Usually the test (i < byte_count) is OK because byte_count is divisible by samplesperpixel.
+ But if that is not the case, (i + ncomps) < byte_count should be used, or
+ maybe (i + samplesperpixel) <= byte_count
+
+2019-01-28 Thomas Bernard
+
+ tiffcrop: shut up clang warnings.
+ Make building of the output filename a bit simpler
+ and remove the use of strcat()
+
+2019-01-23 Scott Gayou
+
+ Fix for simple memory leak that was assigned CVE-2019-6128.
+ pal2rgb failed to free memory on a few errors. This was reported
+ here: http://bugzilla.maptools.org/show_bug.cgi?id=2836.
+
+2019-01-05 Bob Friesenhahn
+
+ Fix tiff2ps error regarding "Inconsistent value of es" by allowing es to be zero. Problem was reported to the tiff mailing list by Julian H. Stacey on January 5, 2019.
+
+2018-12-13 Hugo Lefeuvre
+
+ tif_dir: unset transferfunction field if necessary.
+ The number of entries in the transfer table is determined as follows
+ (see the sketch after these entries):
+
+ (td->td_samplesperpixel - td->td_extrasamples) > 1 ? 3 : 1
+
+ This means that whenever td->td_samplesperpixel or td->td_extrasamples is
+ modified we also need to make sure that the number of required entries in
+ the transfer table didn't change.
+
+ If it changed and the number of entries is higher than before, we should
+ invalidate the transfer table field and free previously allocated values.
+ In the other case there's nothing to do; additional tf entries won't harm,
+ and properly written code will just ignore them since spp - es < 1.
+
+ For instance this situation might happen when reading an OJPEG-compressed
+ image with a missing SamplesPerPixel tag. In this case the SamplesPerPixel
+ field might be updated after setting the transfer table.
+
+ see http://bugzilla.maptools.org/show_bug.cgi?id=2500
+
+ This commit addresses CVE-2018-19210.
+
+2018-12-08 Bob Friesenhahn
+
+ Do not attempt to re-sync the zip stream after a reported data error from inflate().
+
+2018-12-07 Even Rouault
+
+ Merge branch 'resource-leaks' into 'master'
+ Fix two resource leaks
+
+ See merge request libtiff/libtiff!43
+
+2018-12-07 Even Rouault
+
+ Merge branch 'build-jbig' into 'master'
+ add jbig support to the fuzzer
+
+ See merge request libtiff/libtiff!42
+
+2018-12-01 Bob Friesenhahn
+
+ tiffcrop.c: Avoid new clang warning about tools/tiffcrop.c "size argument in 'strncat' call appears to be size of the source".
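A schematic illustration of the transfer-table rule quoted in the 2018-12-13 entry above; this is not the actual libtiff code, and the helper name is invented.

    /* Number of TransferFunction entries a directory needs, following
       (td->td_samplesperpixel - td->td_extrasamples) > 1 ? 3 : 1 */
    static int tf_entries(int samplesperpixel, int extrasamples)
    {
        return (samplesperpixel - extrasamples) > 1 ? 3 : 1;
    }

    /* Whenever samplesperpixel or extrasamples changes, tf_entries() must be
       recomputed: if it grew from 1 to 3, a previously set one-entry transfer
       table has to be invalidated and freed; if it shrank, the extra entries
       are harmless and properly written readers ignore them. */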
+ +2018-11-28 Even Rouault + + Merge branch 'webp_memleak' into 'master' + fixed mem leak in webp compression + + See merge request libtiff/libtiff!48 + +2018-11-28 Norman Barker + + fixed mem leak in webp compression. + +2018-11-20 Even Rouault + + Merge branch 'lossless_webp' into 'master' + fixed lossless webp compression config + + See merge request libtiff/libtiff!46 + +2018-11-20 Norman Barker + + fixed lossless webp compression config. + +2018-11-18 Bob Friesenhahn + + snprintf porting fix for Visual Studio 2003. + +2018-11-18 Roger Leigh + + ci: Add pages job. + +2018-11-10 Bob Friesenhahn + + Change references from defunct ftp site to https site. + 2018-11-10 Bob Friesenhahn * configure.ac: libtiff 4.0.10 released. diff --git a/3rdparty/libtiff/libport.h b/3rdparty/libtiff/libport.h index ff26263829..9f2dace144 100644 --- a/3rdparty/libtiff/libport.h +++ b/3rdparty/libtiff/libport.h @@ -24,6 +24,10 @@ #ifndef _LIBPORT_ #define _LIBPORT_ +#if defined(HAVE_CONFIG_H) +# include +#endif + int getopt(int argc, char * const argv[], const char *optstring); extern char *optarg; extern int opterr; @@ -36,16 +40,16 @@ int strcasecmp(const char *s1, const char *s2); # define HAVE_GETOPT 1 #endif -#if HAVE_STRTOL +#if !defined(HAVE_STRTOL) long strtol(const char *nptr, char **endptr, int base); #endif -#if HAVE_STRTOLL +#if !defined(HAVE_STRTOLL) long long strtoll(const char *nptr, char **endptr, int base); #endif -#if HAVE_STRTOUL +#if !defined(HAVE_STRTOUL) unsigned long strtoul(const char *nptr, char **endptr, int base); #endif -#if HAVE_STRTOULL +#if !defined(HAVE_STRTOULL) unsigned long long strtoull(const char *nptr, char **endptr, int base); #endif diff --git a/3rdparty/libtiff/tif_aux.c b/3rdparty/libtiff/tif_aux.c index 90d30214c6..c9f190545e 100644 --- a/3rdparty/libtiff/tif_aux.c +++ b/3rdparty/libtiff/tif_aux.c @@ -35,27 +35,61 @@ uint32 _TIFFMultiply32(TIFF* tif, uint32 first, uint32 second, const char* where) { - uint32 bytes = first * second; - - if (second && bytes / second != first) { + if (second && first > TIFF_UINT32_MAX / second) { TIFFErrorExt(tif->tif_clientdata, where, "Integer overflow in %s", where); - bytes = 0; + return 0; } - return bytes; + return first * second; } uint64 _TIFFMultiply64(TIFF* tif, uint64 first, uint64 second, const char* where) { - uint64 bytes = first * second; - - if (second && bytes / second != first) { + if (second && first > TIFF_UINT64_MAX / second) { TIFFErrorExt(tif->tif_clientdata, where, "Integer overflow in %s", where); - bytes = 0; + return 0; } - return bytes; + return first * second; +} + +tmsize_t +_TIFFMultiplySSize(TIFF* tif, tmsize_t first, tmsize_t second, const char* where) +{ + if( first <= 0 || second <= 0 ) + { + if( tif != NULL && where != NULL ) + { + TIFFErrorExt(tif->tif_clientdata, where, + "Invalid argument to _TIFFMultiplySSize() in %s", where); + } + return 0; + } + + if( first > TIFF_TMSIZE_T_MAX / second ) + { + if( tif != NULL && where != NULL ) + { + TIFFErrorExt(tif->tif_clientdata, where, + "Integer overflow in %s", where); + } + return 0; + } + return first * second; +} + +tmsize_t _TIFFCastUInt64ToSSize(TIFF* tif, uint64 val, const char* module) +{ + if( val > (uint64)TIFF_TMSIZE_T_MAX ) + { + if( tif != NULL && module != NULL ) + { + TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow"); + } + return 0; + } + return (tmsize_t)val; } void* @@ -63,13 +97,14 @@ _TIFFCheckRealloc(TIFF* tif, void* buffer, tmsize_t nmemb, tmsize_t elem_size, const char* what) { void* cp = NULL; - tmsize_t bytes = nmemb * 
elem_size; - + tmsize_t count = _TIFFMultiplySSize(tif, nmemb, elem_size, NULL); /* - * XXX: Check for integer overflow. + * Check for integer overflow. */ - if (nmemb && elem_size && bytes / elem_size == nmemb) - cp = _TIFFrealloc(buffer, bytes); + if (count != 0) + { + cp = _TIFFrealloc(buffer, count); + } if (cp == NULL) { TIFFErrorExt(tif->tif_clientdata, tif->tif_name, @@ -235,7 +270,7 @@ TIFFVGetFieldDefaulted(TIFF* tif, uint32 tag, va_list ap) return (1); case TIFFTAG_EXTRASAMPLES: *va_arg(ap, uint16 *) = td->td_extrasamples; - *va_arg(ap, uint16 **) = td->td_sampleinfo; + *va_arg(ap, const uint16 **) = td->td_sampleinfo; return (1); case TIFFTAG_MATTEING: *va_arg(ap, uint16 *) = @@ -257,8 +292,8 @@ TIFFVGetFieldDefaulted(TIFF* tif, uint32 tag, va_list ap) case TIFFTAG_YCBCRCOEFFICIENTS: { /* defaults are from CCIR Recommendation 601-1 */ - static float ycbcrcoeffs[] = { 0.299f, 0.587f, 0.114f }; - *va_arg(ap, float **) = ycbcrcoeffs; + static const float ycbcrcoeffs[] = { 0.299f, 0.587f, 0.114f }; + *va_arg(ap, const float **) = ycbcrcoeffs; return 1; } case TIFFTAG_YCBCRSUBSAMPLING: @@ -270,14 +305,14 @@ TIFFVGetFieldDefaulted(TIFF* tif, uint32 tag, va_list ap) return (1); case TIFFTAG_WHITEPOINT: { - static float whitepoint[2]; - /* TIFF 6.0 specification tells that it is no default value for the WhitePoint, but AdobePhotoshop TIFF Technical Note tells that it should be CIE D50. */ - whitepoint[0] = D50_X0 / (D50_X0 + D50_Y0 + D50_Z0); - whitepoint[1] = D50_Y0 / (D50_X0 + D50_Y0 + D50_Z0); - *va_arg(ap, float **) = whitepoint; + static const float whitepoint[] = { + D50_X0 / (D50_X0 + D50_Y0 + D50_Z0), + D50_Y0 / (D50_X0 + D50_Y0 + D50_Z0) + }; + *va_arg(ap, const float **) = whitepoint; return 1; } case TIFFTAG_TRANSFERFUNCTION: @@ -286,16 +321,16 @@ TIFFVGetFieldDefaulted(TIFF* tif, uint32 tag, va_list ap) TIFFErrorExt(tif->tif_clientdata, tif->tif_name, "No space for \"TransferFunction\" tag"); return (0); } - *va_arg(ap, uint16 **) = td->td_transferfunction[0]; + *va_arg(ap, const uint16 **) = td->td_transferfunction[0]; if (td->td_samplesperpixel - td->td_extrasamples > 1) { - *va_arg(ap, uint16 **) = td->td_transferfunction[1]; - *va_arg(ap, uint16 **) = td->td_transferfunction[2]; + *va_arg(ap, const uint16 **) = td->td_transferfunction[1]; + *va_arg(ap, const uint16 **) = td->td_transferfunction[2]; } return (1); case TIFFTAG_REFERENCEBLACKWHITE: if (!td->td_refblackwhite && !TIFFDefaultRefBlackWhite(td)) return (0); - *va_arg(ap, float **) = td->td_refblackwhite; + *va_arg(ap, const float **) = td->td_refblackwhite; return (1); } return 0; diff --git a/3rdparty/libtiff/tif_compress.c b/3rdparty/libtiff/tif_compress.c index 8130ef08ef..915478f500 100644 --- a/3rdparty/libtiff/tif_compress.c +++ b/3rdparty/libtiff/tif_compress.c @@ -264,7 +264,7 @@ TIFFGetConfiguredCODECs() return NULL; } codecs = new_codecs; - _TIFFmemcpy(codecs + i - 1, cd, sizeof(TIFFCodec)); + _TIFFmemcpy(codecs + i - 1, cd->info, sizeof(TIFFCodec)); i++; } for (c = _TIFFBuiltinCODECS; c->name; c++) { diff --git a/3rdparty/libtiff/tif_dir.c b/3rdparty/libtiff/tif_dir.c index b4ecd44f95..347b7115cb 100644 --- a/3rdparty/libtiff/tif_dir.c +++ b/3rdparty/libtiff/tif_dir.c @@ -29,6 +29,7 @@ * (and also some miscellaneous stuff) */ #include "tiffiop.h" +#include /*--: for Rational2Double */ /* * These are used in the backwards compatibility code... 
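The _TIFFMultiply32()/_TIFFMultiply64()/_TIFFMultiplySSize() rewrite in the tif_aux.c hunks above (also picked up by setByteArray() in the next hunk) replaces the old multiply-then-divide overflow test with a pre-check against the type maximum, so an out-of-range product is never computed. A self-contained sketch of the pattern, with an invented function name and plain stdint types standing in for libtiff's typedefs:

    #include <stdint.h>

    /* Returns a*b, or 0 if the product would not fit in uint32_t.
       Testing a > UINT32_MAX / b before multiplying rejects the overflow
       up front instead of inspecting a wrapped-around result. */
    static uint32_t mul_checked_u32(uint32_t a, uint32_t b)
    {
        if (b != 0 && a > UINT32_MAX / b)
            return 0; /* would overflow */
        return a * b;
    }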
@@ -46,8 +47,8 @@ setByteArray(void** vpp, void* vp, size_t nmemb, size_t elem_size) *vpp = 0; } if (vp) { - tmsize_t bytes = (tmsize_t)(nmemb * elem_size); - if (elem_size && bytes / elem_size == nmemb) + tmsize_t bytes = _TIFFMultiplySSize(NULL, nmemb, elem_size, NULL); + if (bytes) *vpp = (void*) _TIFFmalloc(bytes); if (*vpp) _TIFFmemcpy(*vpp, vp, bytes); @@ -123,7 +124,7 @@ setExtraSamples(TIFF* tif, va_list ap, uint32* v) { TIFFWarningExt(tif->tif_clientdata,module, "ExtraSamples tag value is changing, " - "but TransferFunction was read with a different value. Cancelling it"); + "but TransferFunction was read with a different value. Canceling it"); TIFFClrFieldBit(tif,FIELD_TRANSFERFUNCTION); _TIFFfree(td->td_transferfunction[0]); td->td_transferfunction[0] = NULL; @@ -205,7 +206,7 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap) /* * If the data require post-decoding processing to byte-swap * samples, set it up here. Note that since tags are required - * to be ordered, compression code can override this behaviour + * to be ordered, compression code can override this behavior * in the setup method if it wants to roll the post decoding * work in with its normal work. */ @@ -275,7 +276,7 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap) { TIFFWarningExt(tif->tif_clientdata,module, "SamplesPerPixel tag value is changing, " - "but SMinSampleValue tag was read with a different value. Cancelling it"); + "but SMinSampleValue tag was read with a different value. Canceling it"); TIFFClrFieldBit(tif,FIELD_SMINSAMPLEVALUE); _TIFFfree(td->td_sminsamplevalue); td->td_sminsamplevalue = NULL; @@ -284,7 +285,7 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap) { TIFFWarningExt(tif->tif_clientdata,module, "SamplesPerPixel tag value is changing, " - "but SMaxSampleValue tag was read with a different value. Cancelling it"); + "but SMaxSampleValue tag was read with a different value. Canceling it"); TIFFClrFieldBit(tif,FIELD_SMAXSAMPLEVALUE); _TIFFfree(td->td_smaxsamplevalue); td->td_smaxsamplevalue = NULL; @@ -296,7 +297,7 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap) { TIFFWarningExt(tif->tif_clientdata,module, "SamplesPerPixel tag value is changing, " - "but TransferFunction was read with a different value. Cancelling it"); + "but TransferFunction was read with a different value. Canceling it"); TIFFClrFieldBit(tif,FIELD_TRANSFERFUNCTION); _TIFFfree(td->td_transferfunction[0]); td->td_transferfunction[0] = NULL; @@ -393,7 +394,7 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap) if (tif->tif_mode != O_RDONLY) goto badvalue32; TIFFWarningExt(tif->tif_clientdata, tif->tif_name, - "Nonstandard tile width %d, convert file", v32); + "Nonstandard tile width %u, convert file", v32); } td->td_tilewidth = v32; tif->tif_flags |= TIFF_ISTILED; @@ -404,7 +405,7 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap) if (tif->tif_mode != O_RDONLY) goto badvalue32; TIFFWarningExt(tif->tif_clientdata, tif->tif_name, - "Nonstandard tile length %d, convert file", v32); + "Nonstandard tile length %u, convert file", v32); } td->td_tilelength = v32; tif->tif_flags |= TIFF_ISTILED; @@ -559,6 +560,10 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap) * Set custom value ... save a copy of the custom tag value. */ tv_size = _TIFFDataSize(fip->field_type); + /*--: Rational2Double: For Rationals evaluate "set_field_type" to determine internal storage size. 
*/ + if (fip->field_type == TIFF_RATIONAL || fip->field_type == TIFF_SRATIONAL) { + tv_size = _TIFFSetGetFieldSize(fip->set_field_type); + } if (tv_size == 0) { status = 0; TIFFErrorExt(tif->tif_clientdata, module, @@ -638,6 +643,7 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap) || fip->field_writecount == TIFF_VARIABLE2 || fip->field_writecount == TIFF_SPP || tv->count > 1) { + /*--: Rational2Double: For Rationals tv_size is set above to 4 or 8 according to fip->set_field_type! */ _TIFFmemcpy(tv->value, va_arg(ap, void *), tv->count * tv_size); } else { @@ -698,6 +704,22 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap) break; case TIFF_RATIONAL: case TIFF_SRATIONAL: + /*-- Rational2Double: For Rationals tv_size is set above to 4 or 8 according to fip->set_field_type! */ + { + if (tv_size == 8) { + double v2 = va_arg(ap, double); + _TIFFmemcpy(val, &v2, tv_size); + } else { + /*-- default should be tv_size == 4 */ + float v3 = (float)va_arg(ap, double); + _TIFFmemcpy(val, &v3, tv_size); + /*-- ToDo: After Testing, this should be removed and tv_size==4 should be set as default. */ + if (tv_size != 4) { + TIFFErrorExt(0,"TIFFLib: _TIFFVSetField()", "Rational2Double: .set_field_type in not 4 but %d", tv_size); + } + } + } + break; case TIFF_FLOAT: { float v2 = _TIFFClampDoubleToFloat(va_arg(ap, double)); @@ -1011,19 +1033,19 @@ _TIFFVGetField(TIFF* tif, uint32 tag, va_list ap) *va_arg(ap, uint16*) = td->td_halftonehints[1]; break; case TIFFTAG_COLORMAP: - *va_arg(ap, uint16**) = td->td_colormap[0]; - *va_arg(ap, uint16**) = td->td_colormap[1]; - *va_arg(ap, uint16**) = td->td_colormap[2]; + *va_arg(ap, const uint16**) = td->td_colormap[0]; + *va_arg(ap, const uint16**) = td->td_colormap[1]; + *va_arg(ap, const uint16**) = td->td_colormap[2]; break; case TIFFTAG_STRIPOFFSETS: case TIFFTAG_TILEOFFSETS: _TIFFFillStriles( tif ); - *va_arg(ap, uint64**) = td->td_stripoffset; + *va_arg(ap, const uint64**) = td->td_stripoffset_p; break; case TIFFTAG_STRIPBYTECOUNTS: case TIFFTAG_TILEBYTECOUNTS: _TIFFFillStriles( tif ); - *va_arg(ap, uint64**) = td->td_stripbytecount; + *va_arg(ap, const uint64**) = td->td_stripbytecount_p; break; case TIFFTAG_MATTEING: *va_arg(ap, uint16*) = @@ -1032,7 +1054,7 @@ _TIFFVGetField(TIFF* tif, uint32 tag, va_list ap) break; case TIFFTAG_EXTRASAMPLES: *va_arg(ap, uint16*) = td->td_extrasamples; - *va_arg(ap, uint16**) = td->td_sampleinfo; + *va_arg(ap, const uint16**) = td->td_sampleinfo; break; case TIFFTAG_TILEWIDTH: *va_arg(ap, uint32*) = td->td_tilewidth; @@ -1067,7 +1089,7 @@ _TIFFVGetField(TIFF* tif, uint32 tag, va_list ap) break; case TIFFTAG_SUBIFD: *va_arg(ap, uint16*) = td->td_nsubifd; - *va_arg(ap, uint64**) = td->td_subifd; + *va_arg(ap, const uint64**) = td->td_subifd; break; case TIFFTAG_YCBCRPOSITIONING: *va_arg(ap, uint16*) = td->td_ycbcrpositioning; @@ -1077,20 +1099,20 @@ _TIFFVGetField(TIFF* tif, uint32 tag, va_list ap) *va_arg(ap, uint16*) = td->td_ycbcrsubsampling[1]; break; case TIFFTAG_TRANSFERFUNCTION: - *va_arg(ap, uint16**) = td->td_transferfunction[0]; + *va_arg(ap, const uint16**) = td->td_transferfunction[0]; if (td->td_samplesperpixel - td->td_extrasamples > 1) { - *va_arg(ap, uint16**) = td->td_transferfunction[1]; - *va_arg(ap, uint16**) = td->td_transferfunction[2]; + *va_arg(ap, const uint16**) = td->td_transferfunction[1]; + *va_arg(ap, const uint16**) = td->td_transferfunction[2]; } else { - *va_arg(ap, uint16**) = NULL; - *va_arg(ap, uint16**) = NULL; + *va_arg(ap, const uint16**) = NULL; + *va_arg(ap, const uint16**) = NULL; 
} break; case TIFFTAG_REFERENCEBLACKWHITE: - *va_arg(ap, float**) = td->td_refblackwhite; + *va_arg(ap, const float**) = td->td_refblackwhite; break; case TIFFTAG_INKNAMES: - *va_arg(ap, char**) = td->td_inknames; + *va_arg(ap, const char**) = td->td_inknames; break; default: { @@ -1132,7 +1154,7 @@ _TIFFVGetField(TIFF* tif, uint32 tag, va_list ap) *va_arg(ap, uint32*) = (uint32)tv->count; else /* Assume TIFF_VARIABLE */ *va_arg(ap, uint16*) = (uint16)tv->count; - *va_arg(ap, void **) = tv->value; + *va_arg(ap, const void **) = tv->value; ret_val = 1; } else if (fip->field_tag == TIFFTAG_DOTRANGE && strcmp(fip->field_name,"DotRange") == 0) { @@ -1200,6 +1222,23 @@ _TIFFVGetField(TIFF* tif, uint32 tag, va_list ap) break; case TIFF_RATIONAL: case TIFF_SRATIONAL: + { + /*-- Rational2Double: For Rationals evaluate "set_field_type" to determine internal storage size and return value size. */ + int tv_size = _TIFFSetGetFieldSize(fip->set_field_type); + if (tv_size == 8) { + *va_arg(ap, double*) = *(double *)val; + ret_val = 1; + } else { + /*-- default should be tv_size == 4 */ + *va_arg(ap, float*) = *(float *)val; + ret_val = 1; + /*-- ToDo: After Testing, this should be removed and tv_size==4 should be set as default. */ + if (tv_size != 4) { + TIFFErrorExt(0,"TIFFLib: _TIFFVGetField()", "Rational2Double: .set_field_type in not 4 but %d", tv_size); + } + } + } + break; case TIFF_FLOAT: *va_arg(ap, float*) = *(float *)val; @@ -1282,8 +1321,9 @@ TIFFFreeDirectory(TIFF* tif) CleanupField(td_transferfunction[0]); CleanupField(td_transferfunction[1]); CleanupField(td_transferfunction[2]); - CleanupField(td_stripoffset); - CleanupField(td_stripbytecount); + CleanupField(td_stripoffset_p); + CleanupField(td_stripbytecount_p); + td->td_stripoffsetbyteallocsize = 0; TIFFClrFieldBit(tif, FIELD_YCBCRSUBSAMPLING); TIFFClrFieldBit(tif, FIELD_YCBCRPOSITIONING); @@ -1296,10 +1336,8 @@ TIFFFreeDirectory(TIFF* tif) td->td_customValueCount = 0; CleanupField(td_customValues); -#if defined(DEFER_STRILE_LOAD) _TIFFmemset( &(td->td_stripoffset_entry), 0, sizeof(TIFFDirEntry)); _TIFFmemset( &(td->td_stripbytecount_entry), 0, sizeof(TIFFDirEntry)); -#endif } #undef CleanupField @@ -1365,6 +1403,17 @@ TIFFCreateEXIFDirectory(TIFF* tif) return TIFFCreateCustomDirectory(tif, exifFieldArray); } +/* + * Creates the EXIF GPS custom directory + */ +int +TIFFCreateGPSDirectory(TIFF* tif) +{ + const TIFFFieldArray* gpsFieldArray; + gpsFieldArray = _TIFFGetGpsFields(); + return TIFFCreateCustomDirectory(tif, gpsFieldArray); +} + /* * Setup a default directory structure. */ @@ -1387,7 +1436,9 @@ TIFFDefaultDirectory(TIFF* tif) td->td_tilewidth = 0; td->td_tilelength = 0; td->td_tiledepth = 1; +#ifdef STRIPBYTECOUNTSORTED_UNUSED td->td_stripbytecountsorted = 1; /* Our own arrays always sorted. 
*/ +#endif td->td_resolutionunit = RESUNIT_INCH; td->td_sampleformat = SAMPLEFORMAT_UINT; td->td_imagedepth = 1; diff --git a/3rdparty/libtiff/tif_dir.h b/3rdparty/libtiff/tif_dir.h index b2f5e69488..f608dd713b 100644 --- a/3rdparty/libtiff/tif_dir.h +++ b/3rdparty/libtiff/tif_dir.h @@ -58,6 +58,7 @@ typedef struct { uint32 toff_long; uint64 toff_long8; } tdir_offset; /* either offset or the data itself if fits */ + uint8 tdir_ignore; /* flag status to ignore tag when parsing tags in tif_dirread.c */ } TIFFDirEntry; /* @@ -97,13 +98,14 @@ typedef struct { * number of striles */ uint32 td_stripsperimage; uint32 td_nstrips; /* size of offset & bytecount arrays */ - uint64* td_stripoffset; - uint64* td_stripbytecount; + uint64* td_stripoffset_p; /* should be accessed with TIFFGetStrileOffset */ + uint64* td_stripbytecount_p; /* should be accessed with TIFFGetStrileByteCount */ + uint32 td_stripoffsetbyteallocsize; /* number of elements currently allocated for td_stripoffset/td_stripbytecount. Only used if TIFF_LAZYSTRILELOAD is set */ +#ifdef STRIPBYTECOUNTSORTED_UNUSED int td_stripbytecountsorted; /* is the bytecount array sorted ascending? */ -#if defined(DEFER_STRILE_LOAD) +#endif TIFFDirEntry td_stripoffset_entry; /* for deferred loading */ TIFFDirEntry td_stripbytecount_entry; /* for deferred loading */ -#endif uint16 td_nsubifd; uint64* td_subifd; /* YCbCr parameters */ @@ -118,6 +120,8 @@ typedef struct { int td_customValueCount; TIFFTagValue *td_customValues; + + unsigned char td_deferstrilearraywriting; /* see TIFFDeferStrileArrayWriting() */ } TIFFDirectory; /* @@ -257,6 +261,7 @@ extern "C" { extern const TIFFFieldArray* _TIFFGetFields(void); extern const TIFFFieldArray* _TIFFGetExifFields(void); +extern const TIFFFieldArray* _TIFFGetGpsFields(void); extern void _TIFFSetupFields(TIFF* tif, const TIFFFieldArray* infoarray); extern void _TIFFPrintFieldInfo(TIFF*, FILE*); @@ -265,6 +270,7 @@ extern int _TIFFFillStriles(TIFF*); typedef enum { tfiatImage, tfiatExif, + tfiatGps, /* EXIF-GPS fields array type */ tfiatOther } TIFFFieldArrayType; diff --git a/3rdparty/libtiff/tif_dirinfo.c b/3rdparty/libtiff/tif_dirinfo.c index e1f6b23e9a..7217042c25 100644 --- a/3rdparty/libtiff/tif_dirinfo.c +++ b/3rdparty/libtiff/tif_dirinfo.c @@ -47,9 +47,19 @@ #endif static const TIFFFieldArray tiffFieldArray; static const TIFFFieldArray exifFieldArray; +static const TIFFFieldArray gpsFieldArray; #ifdef _MSC_VER #pragma warning( pop ) #endif +/*--: Rational2Double: -- + * The Rational2Double upgraded libtiff functionality allows the definition and achievement of true double-precision accuracy + * for TIFF tags of RATIONAL type and field_bit=FIELD_CUSTOM using the set_field_type = TIFF_SETGET_DOUBLE. + * Unfortunately, that changes the old implemented interface for TIFFGetField(). + * In order to keep the old TIFFGetField() interface behavior those tags have to be redefined with set_field_type = TIFF_SETGET_FLOAT! + * + * Rational custom arrays are already defined as _Cxx_FLOAT, thus can stay. 
+ * + */ static const TIFFField tiffFields[] = { @@ -75,12 +85,12 @@ tiffFields[] = { { TIFFTAG_STRIPBYTECOUNTS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_STRIPBYTECOUNTS, 0, 0, "StripByteCounts", NULL }, { TIFFTAG_MINSAMPLEVALUE, -2, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_MINSAMPLEVALUE, 1, 0, "MinSampleValue", NULL }, { TIFFTAG_MAXSAMPLEVALUE, -2, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_MAXSAMPLEVALUE, 1, 0, "MaxSampleValue", NULL }, - { TIFFTAG_XRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTION, 1, 0, "XResolution", NULL }, - { TIFFTAG_YRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTION, 1, 0, "YResolution", NULL }, + { TIFFTAG_XRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTION, 1, 0, "XResolution", NULL }, + { TIFFTAG_YRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTION, 1, 0, "YResolution", NULL }, { TIFFTAG_PLANARCONFIG, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_PLANARCONFIG, 0, 0, "PlanarConfiguration", NULL }, { TIFFTAG_PAGENAME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PageName", NULL }, - { TIFFTAG_XPOSITION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_POSITION, 1, 0, "XPosition", NULL }, - { TIFFTAG_YPOSITION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_POSITION, 1, 0, "YPosition", NULL }, + { TIFFTAG_XPOSITION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_POSITION, 1, 0, "XPosition", NULL }, + { TIFFTAG_YPOSITION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_POSITION, 1, 0, "YPosition", NULL }, { TIFFTAG_FREEOFFSETS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 0, 0, "FreeOffsets", NULL }, { TIFFTAG_FREEBYTECOUNTS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 0, 0, "FreeByteCounts", NULL }, { TIFFTAG_GRAYRESPONSEUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "GrayResponseUnit", NULL }, @@ -135,14 +145,18 @@ tiffFields[] = { { TIFFTAG_PIXAR_MATRIX_WORLDTOSCREEN, 16, 16, TIFF_FLOAT, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MatrixWorldToScreen", NULL }, { TIFFTAG_PIXAR_MATRIX_WORLDTOCAMERA, 16, 16, TIFF_FLOAT, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MatrixWorldToCamera", NULL }, { TIFFTAG_CFAREPEATPATTERNDIM, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "CFARepeatPatternDim", NULL }, - { TIFFTAG_CFAPATTERN, 4, 4, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "CFAPattern" , NULL}, + { TIFFTAG_CFAPATTERN, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "CFAPattern" , NULL}, { TIFFTAG_COPYRIGHT, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Copyright", NULL }, /* end Pixar tags */ - { TIFFTAG_RICHTIFFIPTC, -3, -3, TIFF_LONG, 0, TIFF_SETGET_C32_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "RichTIFFIPTC", NULL }, + { TIFFTAG_RICHTIFFIPTC, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "RichTIFFIPTC", NULL }, { TIFFTAG_PHOTOSHOP, -3, -3, TIFF_BYTE, 0, 
TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "Photoshop", NULL }, - { TIFFTAG_EXIFIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "EXIFIFDOffset", (TIFFFieldArray*) &exifFieldArray }, + /*--: EXIFIFD and GPSIFD specified as TIFF_LONG by Aware-Systems and not TIFF_IFD8 as in original LibTiff. + * However, for IFD-like tags, libtiff uses the data type TIFF_IFD8 in tiffFields[]-tag definition combined with + * a special handling procedure in order to write either a 32-bit value and the TIFF_IFD type-id into ClassicTIFF files + * or a 64-bit value and the TIFF_IFD8 type-id into BigTIFF files. */ + { TIFFTAG_EXIFIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EXIFIFDOffset", (TIFFFieldArray*) &exifFieldArray }, { TIFFTAG_ICCPROFILE, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ICC Profile", NULL }, - { TIFFTAG_GPSIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "GPSIFDOffset", NULL }, + { TIFFTAG_GPSIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "GPSIFDOffset", (TIFFFieldArray*) &gpsFieldArray }, { TIFFTAG_FAXRECVPARAMS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_CUSTOM, TRUE, FALSE, "FaxRecvParams", NULL }, { TIFFTAG_FAXSUBADDRESS, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_ASCII, FIELD_CUSTOM, TRUE, FALSE, "FaxSubAddress", NULL }, { TIFFTAG_FAXRECVTIME, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_CUSTOM, TRUE, FALSE, "FaxRecvTime", NULL }, @@ -163,7 +177,7 @@ tiffFields[] = { { TIFFTAG_BLACKLEVELDELTAV, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "BlackLevelDeltaV", NULL }, { TIFFTAG_WHITELEVEL, -1, -1, TIFF_LONG, 0, TIFF_SETGET_C16_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "WhiteLevel", NULL }, { TIFFTAG_DEFAULTSCALE, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DefaultScale", NULL }, - { TIFFTAG_BESTQUALITYSCALE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BestQualityScale", NULL }, + { TIFFTAG_BESTQUALITYSCALE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BestQualityScale", NULL }, { TIFFTAG_DEFAULTCROPORIGIN, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DefaultCropOrigin", NULL }, { TIFFTAG_DEFAULTCROPSIZE, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DefaultCropSize", NULL }, { TIFFTAG_COLORMATRIX1, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ColorMatrix1", NULL }, @@ -175,16 +189,16 @@ tiffFields[] = { { TIFFTAG_ANALOGBALANCE, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "AnalogBalance", NULL }, { TIFFTAG_ASSHOTNEUTRAL, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "AsShotNeutral", NULL }, { TIFFTAG_ASSHOTWHITEXY, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "AsShotWhiteXY", NULL }, - { TIFFTAG_BASELINEEXPOSURE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BaselineExposure", NULL }, - { TIFFTAG_BASELINENOISE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BaselineNoise", 
NULL }, - { TIFFTAG_BASELINESHARPNESS, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BaselineSharpness", NULL }, + { TIFFTAG_BASELINEEXPOSURE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BaselineExposure", NULL }, + { TIFFTAG_BASELINENOISE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BaselineNoise", NULL }, + { TIFFTAG_BASELINESHARPNESS, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BaselineSharpness", NULL }, { TIFFTAG_BAYERGREENSPLIT, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BayerGreenSplit", NULL }, - { TIFFTAG_LINEARRESPONSELIMIT, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "LinearResponseLimit", NULL }, + { TIFFTAG_LINEARRESPONSELIMIT, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "LinearResponseLimit", NULL }, { TIFFTAG_CAMERASERIALNUMBER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CameraSerialNumber", NULL }, { TIFFTAG_LENSINFO, 4, 4, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "LensInfo", NULL }, - { TIFFTAG_CHROMABLURRADIUS, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ChromaBlurRadius", NULL }, - { TIFFTAG_ANTIALIASSTRENGTH, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "AntiAliasStrength", NULL }, - { TIFFTAG_SHADOWSCALE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ShadowScale", NULL }, + { TIFFTAG_CHROMABLURRADIUS, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ChromaBlurRadius", NULL }, + { TIFFTAG_ANTIALIASSTRENGTH, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "AntiAliasStrength", NULL }, + { TIFFTAG_SHADOWSCALE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ShadowScale", NULL }, { TIFFTAG_DNGPRIVATEDATA, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "DNGPrivateData", NULL }, { TIFFTAG_MAKERNOTESAFETY, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "MakerNoteSafety", NULL }, { TIFFTAG_CALIBRATIONILLUMINANT1, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "CalibrationIlluminant1", NULL }, @@ -217,47 +231,68 @@ tiffFields[] = { /* begin pseudo tags */ }; +/* + * EXIF tags (Version 2.31, July 2016 plus version 2.32 May 2019) + */ static const TIFFField exifFields[] = { - { EXIFTAG_EXPOSURETIME, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureTime", NULL }, - { EXIFTAG_FNUMBER, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FNumber", NULL }, + { EXIFTAG_EXPOSURETIME, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureTime", NULL }, + { EXIFTAG_FNUMBER, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FNumber", NULL }, { EXIFTAG_EXPOSUREPROGRAM, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureProgram", NULL }, { EXIFTAG_SPECTRALSENSITIVITY, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, 
"SpectralSensitivity", NULL }, { EXIFTAG_ISOSPEEDRATINGS, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ISOSpeedRatings", NULL }, { EXIFTAG_OECF, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "OptoelectricConversionFactor", NULL }, + { EXIFTAG_SENSITIVITYTYPE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SensitivityType", NULL }, + { EXIFTAG_STANDARDOUTPUTSENSITIVITY, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "StandardOutputSensitivity", NULL }, + { EXIFTAG_RECOMMENDEDEXPOSUREINDEX, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RecommendedExposureIndex", NULL }, + { EXIFTAG_ISOSPEED, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ISOSpeed", NULL }, + { EXIFTAG_ISOSPEEDLATITUDEYYY, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ISOSpeedLatitudeyyy", NULL }, + { EXIFTAG_ISOSPEEDLATITUDEZZZ, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ISOSpeedLatitudezzz", NULL }, { EXIFTAG_EXIFVERSION, 4, 4, TIFF_UNDEFINED, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExifVersion", NULL }, { EXIFTAG_DATETIMEORIGINAL, 20, 20, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DateTimeOriginal", NULL }, { EXIFTAG_DATETIMEDIGITIZED, 20, 20, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DateTimeDigitized", NULL }, + { EXIFTAG_OFFSETTIME, 7, 7, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OffsetTime", NULL }, + { EXIFTAG_OFFSETTIMEORIGINAL, 7, 7, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OffsetTimeOriginal", NULL }, + { EXIFTAG_OFFSETTIMEDIGITIZED, 7, 7, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OffsetTimeDigitized", NULL }, { EXIFTAG_COMPONENTSCONFIGURATION, 4, 4, TIFF_UNDEFINED, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ComponentsConfiguration", NULL }, - { EXIFTAG_COMPRESSEDBITSPERPIXEL, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CompressedBitsPerPixel", NULL }, - { EXIFTAG_SHUTTERSPEEDVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ShutterSpeedValue", NULL }, - { EXIFTAG_APERTUREVALUE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ApertureValue", NULL }, - { EXIFTAG_BRIGHTNESSVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BrightnessValue", NULL }, - { EXIFTAG_EXPOSUREBIASVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureBiasValue", NULL }, - { EXIFTAG_MAXAPERTUREVALUE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MaxApertureValue", NULL }, - { EXIFTAG_SUBJECTDISTANCE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectDistance", NULL }, + { EXIFTAG_COMPRESSEDBITSPERPIXEL, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CompressedBitsPerPixel", NULL }, + { EXIFTAG_SHUTTERSPEEDVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ShutterSpeedValue", NULL }, + { EXIFTAG_APERTUREVALUE, 
1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ApertureValue", NULL }, + { EXIFTAG_BRIGHTNESSVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BrightnessValue", NULL }, + { EXIFTAG_EXPOSUREBIASVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureBiasValue", NULL }, + { EXIFTAG_MAXAPERTUREVALUE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MaxApertureValue", NULL }, + /*--: EXIFTAG_SUBJECTDISTANCE: LibTiff returns value of "-1" if numerator equals 4294967295 (0xFFFFFFFF) to indicate infinite distance! + * However, there are two other EXIF tags where numerator indicates a special value and six other cases where the denominator indicates special values, + * which are not treated within LibTiff!! */ + { EXIFTAG_SUBJECTDISTANCE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectDistance", NULL }, { EXIFTAG_METERINGMODE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MeteringMode", NULL }, { EXIFTAG_LIGHTSOURCE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LightSource", NULL }, { EXIFTAG_FLASH, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Flash", NULL }, - { EXIFTAG_FOCALLENGTH, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalLength", NULL }, + { EXIFTAG_FOCALLENGTH, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalLength", NULL }, { EXIFTAG_SUBJECTAREA, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "SubjectArea", NULL }, { EXIFTAG_MAKERNOTE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "MakerNote", NULL }, { EXIFTAG_USERCOMMENT, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "UserComment", NULL }, { EXIFTAG_SUBSECTIME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubSecTime", NULL }, { EXIFTAG_SUBSECTIMEORIGINAL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubSecTimeOriginal", NULL }, { EXIFTAG_SUBSECTIMEDIGITIZED, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubSecTimeDigitized", NULL }, + { EXIFTAG_TEMPERATURE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Temperature", NULL }, + { EXIFTAG_HUMIDITY, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Humidity", NULL }, + { EXIFTAG_PRESSURE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Pressure", NULL }, + { EXIFTAG_WATERDEPTH, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "WaterDepth", NULL }, + { EXIFTAG_ACCELERATION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Acceleration", NULL }, + { EXIFTAG_CAMERAELEVATIONANGLE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CameraElevationAngle", NULL }, { EXIFTAG_FLASHPIXVERSION, 4, 4, TIFF_UNDEFINED, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FlashpixVersion", NULL }, { EXIFTAG_COLORSPACE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, 
FIELD_CUSTOM, 1, 0, "ColorSpace", NULL }, { EXIFTAG_PIXELXDIMENSION, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PixelXDimension", NULL }, { EXIFTAG_PIXELYDIMENSION, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PixelYDimension", NULL }, { EXIFTAG_RELATEDSOUNDFILE, 13, 13, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RelatedSoundFile", NULL }, - { EXIFTAG_FLASHENERGY, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FlashEnergy", NULL }, + { EXIFTAG_FLASHENERGY, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FlashEnergy", NULL }, { EXIFTAG_SPATIALFREQUENCYRESPONSE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "SpatialFrequencyResponse", NULL }, - { EXIFTAG_FOCALPLANEXRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneXResolution", NULL }, - { EXIFTAG_FOCALPLANEYRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneYResolution", NULL }, + { EXIFTAG_FOCALPLANEXRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneXResolution", NULL }, + { EXIFTAG_FOCALPLANEYRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneYResolution", NULL }, { EXIFTAG_FOCALPLANERESOLUTIONUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneResolutionUnit", NULL }, { EXIFTAG_SUBJECTLOCATION, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectLocation", NULL }, - { EXIFTAG_EXPOSUREINDEX, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureIndex", NULL }, + { EXIFTAG_EXPOSUREINDEX, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureIndex", NULL }, { EXIFTAG_SENSINGMETHOD, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SensingMethod", NULL }, { EXIFTAG_FILESOURCE, 1, 1, TIFF_UNDEFINED, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FileSource", NULL }, { EXIFTAG_SCENETYPE, 1, 1, TIFF_UNDEFINED, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SceneType", NULL }, @@ -265,22 +300,79 @@ exifFields[] = { { EXIFTAG_CUSTOMRENDERED, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CustomRendered", NULL }, { EXIFTAG_EXPOSUREMODE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureMode", NULL }, { EXIFTAG_WHITEBALANCE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "WhiteBalance", NULL }, - { EXIFTAG_DIGITALZOOMRATIO, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DigitalZoomRatio", NULL }, + { EXIFTAG_DIGITALZOOMRATIO, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DigitalZoomRatio", NULL }, { EXIFTAG_FOCALLENGTHIN35MMFILM, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalLengthIn35mmFilm", NULL }, { EXIFTAG_SCENECAPTURETYPE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SceneCaptureType", NULL }, - { EXIFTAG_GAINCONTROL, 1, 1, TIFF_RATIONAL, 0, 
TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "GainControl", NULL }, + { EXIFTAG_GAINCONTROL, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "GainControl", NULL }, { EXIFTAG_CONTRAST, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Contrast", NULL }, { EXIFTAG_SATURATION, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Saturation", NULL }, { EXIFTAG_SHARPNESS, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Sharpness", NULL }, { EXIFTAG_DEVICESETTINGDESCRIPTION, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "DeviceSettingDescription", NULL }, { EXIFTAG_SUBJECTDISTANCERANGE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectDistanceRange", NULL }, - { EXIFTAG_IMAGEUNIQUEID, 33, 33, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageUniqueID", NULL } + { EXIFTAG_IMAGEUNIQUEID, 33, 33, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageUniqueID", NULL }, + { EXIFTAG_CAMERAOWNERNAME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CameraOwnerName", NULL }, + { EXIFTAG_BODYSERIALNUMBER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BodySerialNumber", NULL }, + { EXIFTAG_LENSSPECIFICATION, 4, 4, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensSpecification", NULL }, + { EXIFTAG_LENSMAKE, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensMake", NULL }, + { EXIFTAG_LENSMODEL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensModel", NULL }, + { EXIFTAG_LENSSERIALNUMBER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensSerialNumber", NULL }, + { EXIFTAG_GAMMA, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Gamma", NULL }, + { EXIFTAG_COMPOSITEIMAGE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CompositeImage", NULL }, + { EXIFTAG_SOURCEIMAGENUMBEROFCOMPOSITEIMAGE, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SourceImageNumberOfCompositeImage", NULL }, + { EXIFTAG_SOURCEEXPOSURETIMESOFCOMPOSITEIMAGE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "SourceExposureTimesOfCompositeImage", NULL } +}; +/* + * EXIF-GPS tags (Version 2.31, July 2016; nothing changed for version 2.32 May 2019) + */ + +static TIFFField +gpsFields[] = { + /* For the GPS tag definitions in gpsFields[] the standard definition for Rationals is TIFF_SETGET_DOUBLE and TIFF_SETGET_C0_FLOAT. + *-- ATTENTION: After the upgrade with Rational2Double, the GPSTAG values can now be written and also read in double precision! + * In order to achieve double precision for GPS tags: + * Standard definitions for GPSTAG is kept to TIFF_SETGET_DOUBLE + * and TIFF_SETGET_C0_FLOAT is changed to TIFF_SETGET_C0_DOUBLE. 
+ */ + { GPSTAG_VERSIONID , 4, 4, TIFF_BYTE , 0, TIFF_SETGET_C0_UINT8 , TIFF_SETGET_UINT8 , FIELD_CUSTOM , 1, 0, "VersionID", NULL }, + { GPSTAG_LATITUDEREF , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "LatitudeRef", NULL }, + { GPSTAG_LATITUDE , 3, 3, TIFF_RATIONAL , 0, TIFF_SETGET_C0_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "Latitude", NULL }, + { GPSTAG_LONGITUDEREF , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "LongitudeRef", NULL }, + { GPSTAG_LONGITUDE , 3, 3, TIFF_RATIONAL , 0, TIFF_SETGET_C0_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "Longitude", NULL }, + { GPSTAG_ALTITUDEREF , 1, 1, TIFF_BYTE , 0, TIFF_SETGET_UINT8 , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "AltitudeRef", NULL }, + { GPSTAG_ALTITUDE , 1, 1, TIFF_RATIONAL , 0, TIFF_SETGET_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "Altitude", NULL }, + { GPSTAG_TIMESTAMP , 3, 3, TIFF_RATIONAL , 0, TIFF_SETGET_C0_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "TimeStamp", NULL }, + { GPSTAG_SATELLITES , -1, -1, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "Satellites", NULL }, + { GPSTAG_STATUS , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "Status", NULL }, + { GPSTAG_MEASUREMODE , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "MeasureMode", NULL }, + { GPSTAG_DOP , 1, 1, TIFF_RATIONAL , 0, TIFF_SETGET_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DOP", NULL }, + { GPSTAG_SPEEDREF , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "SpeedRef", NULL }, + { GPSTAG_SPEED , 1, 1, TIFF_RATIONAL , 0, TIFF_SETGET_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "Speed", NULL }, + { GPSTAG_TRACKREF , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "TrackRef", NULL }, + { GPSTAG_TRACK , 1, 1, TIFF_RATIONAL , 0, TIFF_SETGET_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "Track", NULL }, + { GPSTAG_IMGDIRECTIONREF , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "ImgDirectionRef", NULL }, + { GPSTAG_IMGDIRECTION , 1, 1, TIFF_RATIONAL , 0, TIFF_SETGET_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "ImgDirection", NULL }, + { GPSTAG_MAPDATUM , -1, -1, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "MapDatum", NULL }, + { GPSTAG_DESTLATITUDEREF , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DestLatitudeRef", NULL }, + { GPSTAG_DESTLATITUDE , 3, 3, TIFF_RATIONAL , 0, TIFF_SETGET_C0_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DestLatitude", NULL }, + { GPSTAG_DESTLONGITUDEREF , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DestLongitudeRef", NULL }, + { GPSTAG_DESTLONGITUDE , 3, 3, TIFF_RATIONAL , 0, TIFF_SETGET_C0_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DestLongitude", NULL }, + { GPSTAG_DESTBEARINGREF , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DestBearingRef", NULL }, + { GPSTAG_DESTBEARING , 1, 1, TIFF_RATIONAL , 0, TIFF_SETGET_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DestBearing", NULL }, + { GPSTAG_DESTDISTANCEREF , 2, 2, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DestDistanceRef", NULL }, + { GPSTAG_DESTDISTANCE 
, 1, 1, TIFF_RATIONAL , 0, TIFF_SETGET_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DestDistance", NULL }, + { GPSTAG_PROCESSINGMETHOD , -1, -1, TIFF_UNDEFINED , 0, TIFF_SETGET_C16_UINT8 , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 1, "ProcessingMethod", NULL }, + { GPSTAG_AREAINFORMATION , -1, -1, TIFF_UNDEFINED , 0, TIFF_SETGET_C16_UINT8 , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 1, "AreaInformation", NULL }, + { GPSTAG_DATESTAMP , 11, 11, TIFF_ASCII , 0, TIFF_SETGET_ASCII , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "DateStamp", NULL }, + { GPSTAG_DIFFERENTIAL , 1, 1, TIFF_SHORT , 0, TIFF_SETGET_UINT16 , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "Differential", NULL }, + { GPSTAG_GPSHPOSITIONINGERROR , 1, 1, TIFF_RATIONAL , 0, TIFF_SETGET_DOUBLE , TIFF_SETGET_UNDEFINED , FIELD_CUSTOM , 1, 0, "HorizontalPositioningError", NULL } }; static const TIFFFieldArray tiffFieldArray = { tfiatImage, 0, TIFFArrayCount(tiffFields), (TIFFField*) tiffFields }; static const TIFFFieldArray exifFieldArray = { tfiatExif, 0, TIFFArrayCount(exifFields), (TIFFField*) exifFields }; +static const TIFFFieldArray +gpsFieldArray = { tfiatGps, 0, TIFFArrayCount(gpsFields), (TIFFField*) gpsFields }; /* * We have our own local lfind() equivalent to avoid subtle differences @@ -313,6 +405,12 @@ _TIFFGetExifFields(void) return(&exifFieldArray); } +const TIFFFieldArray* +_TIFFGetGpsFields(void) +{ + return(&gpsFieldArray); +} + void _TIFFSetupFields(TIFF* tif, const TIFFFieldArray* fieldarray) { @@ -502,6 +600,82 @@ _TIFFDataSize(TIFFDataType type) } } +/* + * Rational2Double: + * Return size of TIFFSetGetFieldType in bytes. + * + * XXX: TIFF_RATIONAL values for FIELD_CUSTOM are stored internally as 4-byte float. + * However, some of them should be stored internally as 8-byte double. + * This is now managed by the SetGetField of the tag-definition! 
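+ *
+ * Usage sketch (illustrative only; "fip" is assumed to be the tag's
+ * TIFFField definition): the returned size is what callers use to
+ * dimension the internal value buffer of a custom tag, e.g.
+ *
+ *     int tv_size = _TIFFSetGetFieldSize(fip->set_field_type);
+ *     void* val = _TIFFCheckMalloc(tif, count, tv_size, "custom tag value");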
+ */ +int +_TIFFSetGetFieldSize(TIFFSetGetFieldType setgettype) +{ + switch (setgettype) + { + case TIFF_SETGET_UNDEFINED: + case TIFF_SETGET_ASCII: + case TIFF_SETGET_C0_ASCII: + case TIFF_SETGET_C16_ASCII: + case TIFF_SETGET_C32_ASCII: + case TIFF_SETGET_OTHER: + return 0; + case TIFF_SETGET_UINT8: + case TIFF_SETGET_SINT8: + case TIFF_SETGET_C0_UINT8: + case TIFF_SETGET_C0_SINT8: + case TIFF_SETGET_C16_UINT8: + case TIFF_SETGET_C16_SINT8: + case TIFF_SETGET_C32_UINT8: + case TIFF_SETGET_C32_SINT8: + return 1; + case TIFF_SETGET_UINT16: + case TIFF_SETGET_SINT16: + case TIFF_SETGET_C0_UINT16: + case TIFF_SETGET_C0_SINT16: + case TIFF_SETGET_C16_UINT16: + case TIFF_SETGET_C16_SINT16: + case TIFF_SETGET_C32_UINT16: + case TIFF_SETGET_C32_SINT16: + return 2; + case TIFF_SETGET_INT: + case TIFF_SETGET_UINT32: + case TIFF_SETGET_SINT32: + case TIFF_SETGET_FLOAT: + case TIFF_SETGET_UINT16_PAIR: + case TIFF_SETGET_C0_UINT32: + case TIFF_SETGET_C0_SINT32: + case TIFF_SETGET_C0_FLOAT: + case TIFF_SETGET_C16_UINT32: + case TIFF_SETGET_C16_SINT32: + case TIFF_SETGET_C16_FLOAT: + case TIFF_SETGET_C32_UINT32: + case TIFF_SETGET_C32_SINT32: + case TIFF_SETGET_C32_FLOAT: + return 4; + case TIFF_SETGET_UINT64: + case TIFF_SETGET_SINT64: + case TIFF_SETGET_DOUBLE: + case TIFF_SETGET_IFD8: + case TIFF_SETGET_C0_UINT64: + case TIFF_SETGET_C0_SINT64: + case TIFF_SETGET_C0_DOUBLE: + case TIFF_SETGET_C0_IFD8: + case TIFF_SETGET_C16_UINT64: + case TIFF_SETGET_C16_SINT64: + case TIFF_SETGET_C16_DOUBLE: + case TIFF_SETGET_C16_IFD8: + case TIFF_SETGET_C32_UINT64: + case TIFF_SETGET_C32_SINT64: + case TIFF_SETGET_C32_DOUBLE: + case TIFF_SETGET_C32_IFD8: + return 8; + default: + return 0; + } +} /*-- _TIFFSetGetFieldSize --- */ + + const TIFFField* TIFFFindField(TIFF* tif, uint32 tag, TIFFDataType dt) { @@ -1062,10 +1236,6 @@ _TIFFCheckFieldIsValidForCodec(TIFF *tif, ttag_t tag) if (tag == TIFFTAG_LERC_PARAMETERS) return 1; break; - case COMPRESSION_WEBP: - if (tag == TIFFTAG_PREDICTOR) - return 1; - break; } return 0; } diff --git a/3rdparty/libtiff/tif_dirread.c b/3rdparty/libtiff/tif_dirread.c index 1fdcb0997a..ba127ca917 100644 --- a/3rdparty/libtiff/tif_dirread.c +++ b/3rdparty/libtiff/tif_dirread.c @@ -29,9 +29,6 @@ */ /* Suggested pending improvements: - * - add a field 'ignore' to the TIFFDirEntry structure, to flag status, - * eliminating current use of the IGNORE value, and therefore eliminating - * current irrational behaviour on tags with tag id code 0 * - add a field 'field_info' to the TIFFDirEntry structure, and set that with * the pointer to the appropriate TIFFField structure early on in * TIFFReadDirectory, so as to eliminate current possibly repetitive lookup. @@ -41,9 +38,13 @@ #include "tiffiop.h" #include <float.h> -#define IGNORE 0 /* tag placeholder used below */ #define FAILED_FII ((uint32) -1) +/* + * Largest 64-bit signed integer value.
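+ * (TIFF_UINT64_MAX >> 1 evaluates to 0x7FFFFFFFFFFFFFFF, i.e. INT64_MAX.)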
+ */ +#define TIFF_INT64_MAX ((int64)(TIFF_UINT64_MAX >> 1)) + #ifdef HAVE_IEEEFP # define TIFFCvtIEEEFloatToNative(tif, n, fp) # define TIFFCvtIEEEDoubleToNative(tif, n, dp) @@ -206,6 +207,7 @@ static enum TIFFReadDirEntryErr TIFFReadDirEntryByte(TIFF* tif, TIFFDirEntry* di switch (direntry->tdir_type) { case TIFF_BYTE: + case TIFF_UNDEFINED: /* Support to read TIFF_UNDEFINED with field_readcount==1 */ TIFFReadDirEntryCheckedByte(tif,direntry,value); return(TIFFReadDirEntryErrOk); case TIFF_SBYTE: @@ -637,7 +639,7 @@ static enum TIFFReadDirEntryErr TIFFReadDirEntryFloat(TIFF* tif, TIFFDirEntry* d err=TIFFReadDirEntryCheckedDouble(tif,direntry,&m); if (err!=TIFFReadDirEntryErrOk) return(err); - if ((m > FLT_MAX) || (m < FLT_MIN)) + if ((m > FLT_MAX) || (m < -FLT_MAX)) return(TIFFReadDirEntryErrRange); *value=(float)m; return(TIFFReadDirEntryErrOk); @@ -836,6 +838,7 @@ static enum TIFFReadDirEntryErr TIFFReadDirEntryArrayWithLimit( uint32 datasize; void* data; uint64 target_count64; + int original_datasize_clamped; typesize=TIFFDataWidth(direntry->tdir_type); target_count64 = (direntry->tdir_count > maxcount) ? @@ -848,6 +851,12 @@ static enum TIFFReadDirEntryErr TIFFReadDirEntryArrayWithLimit( } (void) desttypesize; + /* We just want to know if the original tag size is more than 4 bytes + * (classic TIFF) or 8 bytes (BigTIFF) + */ + original_datasize_clamped = + ((direntry->tdir_count > 10) ? 10 : (int)direntry->tdir_count) * typesize; + /* * As a sanity check, make sure we have no more than a 2GB tag array * in either the current data type or the dest data type. This also @@ -862,7 +871,7 @@ static enum TIFFReadDirEntryErr TIFFReadDirEntryArrayWithLimit( datasize=(*count)*typesize; assert((tmsize_t)datasize>0); - if( isMapped(tif) && datasize > (uint32)tif->tif_size ) + if( isMapped(tif) && datasize > (uint64)tif->tif_size ) return TIFFReadDirEntryErrIo; if( !isMapped(tif) && @@ -879,7 +888,7 @@ static enum TIFFReadDirEntryErr TIFFReadDirEntryArrayWithLimit( } if (!(tif->tif_flags&TIFF_BIGTIFF)) { - if (datasize<=4) + if (original_datasize_clamped<=4) _TIFFmemcpy(data,&direntry->tdir_offset,datasize); else { @@ -900,7 +909,7 @@ static enum TIFFReadDirEntryErr TIFFReadDirEntryArrayWithLimit( } else { - if (datasize<=8) + if (original_datasize_clamped<=8) _TIFFmemcpy(data,&direntry->tdir_offset,datasize); else { @@ -3288,11 +3297,6 @@ static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongSlong(int32 value) return(TIFFReadDirEntryErrOk); } -/* - * Largest 32-bit unsigned integer value. - */ -#define TIFF_UINT32_MAX 0xFFFFFFFFU - static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongLong8(uint64 value) { @@ -3311,8 +3315,6 @@ TIFFReadDirEntryCheckRangeLongSlong8(int64 value) return(TIFFReadDirEntryErrOk); } -#undef TIFF_UINT32_MAX - static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSlongLong(uint32 value) { @@ -3378,11 +3380,6 @@ TIFFReadDirEntryCheckRangeLong8Slong8(int64 value) return(TIFFReadDirEntryErrOk); } -/* - * Largest 64-bit signed integer value. 
- */ -#define TIFF_INT64_MAX ((int64)(((uint64) ~0) >> 1)) - static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSlong8Long8(uint64 value) { @@ -3392,8 +3389,6 @@ TIFFReadDirEntryCheckRangeSlong8Long8(uint64 value) return(TIFFReadDirEntryErrOk); } -#undef TIFF_INT64_MAX - static enum TIFFReadDirEntryErr TIFFReadDirEntryData(TIFF* tif, uint64 offset, tmsize_t size, void* dest) { @@ -3406,13 +3401,13 @@ TIFFReadDirEntryData(TIFF* tif, uint64 offset, tmsize_t size, void* dest) } else { size_t ma,mb; ma=(size_t)offset; + if( (uint64)ma!=offset || + ma > (~(size_t)0) - (size_t)size ) + { + return TIFFReadDirEntryErrIo; + } mb=ma+size; - if (((uint64)ma!=offset) - || (mb < ma) - || (mb - ma != (size_t) size) - || (mb < (size_t)size) - || (mb > (size_t)tif->tif_size) - ) + if (mb > (uint64)tif->tif_size) return(TIFFReadDirEntryErrIo); _TIFFmemcpy(dest,tif->tif_base+ma,size); } @@ -3535,6 +3530,49 @@ static int _TIFFGetMaxColorChannels( uint16 photometric ) } } +static int ByteCountLooksBad(TIFF* tif) +{ + /* + * Assume we have wrong StripByteCount value (in case + * of single strip) in following cases: + * - it is equal to zero along with StripOffset; + * - it is larger than file itself (in case of uncompressed + * image); + * - it is smaller than the size of the bytes per row + * multiplied on the number of rows. The last case should + * not be checked in the case of writing new image, + * because we may do not know the exact strip size + * until the whole image will be written and directory + * dumped out. + */ + uint64 bytecount = TIFFGetStrileByteCount(tif, 0); + uint64 offset = TIFFGetStrileOffset(tif, 0); + uint64 filesize; + + if( offset == 0 ) + return 0; + if (bytecount == 0) + return 1; + if ( tif->tif_dir.td_compression != COMPRESSION_NONE ) + return 0; + filesize = TIFFGetFileSize(tif); + if( offset <= filesize && bytecount > filesize - offset ) + return 1; + if( tif->tif_mode == O_RDONLY ) + { + uint64 scanlinesize = TIFFScanlineSize64(tif); + if( tif->tif_dir.td_imagelength > 0 && + scanlinesize > TIFF_UINT64_MAX / tif->tif_dir.td_imagelength ) + { + return 1; + } + if( bytecount < scanlinesize * tif->tif_dir.td_imagelength) + return 1; + } + return 0; +} + + /* * Read the next TIFF directory from a file and convert it to the internal * format. We read directories sequentially. 
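The ByteCountLooksBad() helper introduced above is also careful about unsigned wrap-around: it tests offset <= filesize && bytecount > filesize - offset rather than the naive offset + bytecount > filesize, whose 64-bit sum could overflow and slip past the check. A minimal sketch of the same pattern (hypothetical helper name, shown only for illustration):

    #include <stdint.h>

    /* Overflow-safe test that [offset, offset + bytecount) lies within a
     * file of 'filesize' bytes: once offset <= filesize is known, the
     * subtraction cannot wrap. */
    static int range_fits_in_file(uint64_t offset, uint64_t bytecount,
                                  uint64_t filesize)
    {
        return offset <= filesize && bytecount <= filesize - offset;
    }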
@@ -3581,14 +3619,17 @@ TIFFReadDirectory(TIFF* tif) uint16 nb; for (na=ma+1, nb=mb+1; nb<dircount; na++, nb++) { - if (ma->tdir_tag==na->tdir_tag) - na->tdir_tag=IGNORE; + if (ma->tdir_tag == na->tdir_tag) { + na->tdir_ignore = TRUE; + } } } } tif->tif_flags &= ~TIFF_BEENWRITING; /* reset before new dir */ tif->tif_flags &= ~TIFF_BUF4WRITE; /* reset before new dir */ + tif->tif_flags &= ~TIFF_CHOPPEDUPARRAYS; + /* free any old stuff and reinit */ TIFFFreeDirectory(tif); TIFFDefaultDirectory(tif); @@ -3621,7 +3662,7 @@ TIFFReadDirectory(TIFF* tif) { if (!TIFFFetchNormalTag(tif,dp,0)) goto bad; - dp->tdir_tag=IGNORE; + dp->tdir_ignore = TRUE; } dp=TIFFReadDirectoryFindEntry(tif,dir,dircount,TIFFTAG_COMPRESSION); if (dp) @@ -3644,7 +3685,7 @@ TIFFReadDirectory(TIFF* tif) } if (!TIFFSetField(tif,TIFFTAG_COMPRESSION,value)) goto bad; - dp->tdir_tag=IGNORE; + dp->tdir_ignore = TRUE; } else { @@ -3656,7 +3697,7 @@ */ for (di=0, dp=dir; di<dircount; di++, dp++) { - if (dp->tdir_tag!=IGNORE) + if (!dp->tdir_ignore) { TIFFReadDirectoryFindFieldInfo(tif,dp->tdir_tag,&fii); if (fii == FAILED_FII) @@ -3664,8 +3705,8 @@ TIFFReadDirectory(TIFF* tif) TIFFWarningExt(tif->tif_clientdata, module, "Unknown field with tag %d (0x%x) encountered", dp->tdir_tag,dp->tdir_tag); - /* the following knowingly leaks the - anonymous field structure */ + /* the following knowingly leaks the + anonymous field structure */ if (!_TIFFMergeFields(tif, _TIFFCreateAnonField(tif, dp->tdir_tag, @@ -3676,18 +3717,18 @@ TIFFReadDirectory(TIFF* tif) "Registering anonymous field with tag %d (0x%x) failed", dp->tdir_tag, dp->tdir_tag); - dp->tdir_tag=IGNORE; + dp->tdir_ignore = TRUE; } else { TIFFReadDirectoryFindFieldInfo(tif,dp->tdir_tag,&fii); assert(fii != FAILED_FII); } } } - if (dp->tdir_tag!=IGNORE) + if (!dp->tdir_ignore) { fip=tif->tif_fields[fii]; if (fip->field_bit==FIELD_IGNORE) - dp->tdir_tag=IGNORE; + dp->tdir_ignore = TRUE; else { switch (dp->tdir_tag) @@ -3709,12 +3750,12 @@ TIFFReadDirectory(TIFF* tif) case TIFFTAG_EXTRASAMPLES: if (!TIFFFetchNormalTag(tif,dp,0)) goto bad; - dp->tdir_tag=IGNORE; + dp->tdir_ignore = TRUE; + break; + default: + if( !_TIFFCheckFieldIsValidForCodec(tif, dp->tdir_tag) ) + dp->tdir_ignore = TRUE; break; - default: - if( !_TIFFCheckFieldIsValidForCodec(tif, dp->tdir_tag) ) - dp->tdir_tag=IGNORE; - break; } } } @@ -3730,8 +3771,8 @@ TIFFReadDirectory(TIFF* tif) if ((tif->tif_dir.td_compression==COMPRESSION_OJPEG)&& (tif->tif_dir.td_planarconfig==PLANARCONFIG_SEPARATE)) { - if (!_TIFFFillStriles(tif)) - goto bad; + if (!_TIFFFillStriles(tif)) + goto bad; dp=TIFFReadDirectoryFindEntry(tif,dir,dircount,TIFFTAG_STRIPOFFSETS); if ((dp!=0)&&(dp->tdir_count==1)) { @@ -3803,190 +3844,240 @@ */ for (di=0, dp=dir; di<dircount; di++, dp++) { - switch (dp->tdir_tag) - { - case IGNORE: - break; - case TIFFTAG_MINSAMPLEVALUE: - case TIFFTAG_MAXSAMPLEVALUE: - case TIFFTAG_BITSPERSAMPLE: - case TIFFTAG_DATATYPE: - case TIFFTAG_SAMPLEFORMAT: - /* - * The MinSampleValue, MaxSampleValue, BitsPerSample - * DataType and SampleFormat tags are supposed to be - * written as one value/sample, but some vendors - * incorrectly write one value only -- so we accept - * that as well (yuck). Other vendors write correct - * value for NumberOfSamples, but incorrect one for - * BitsPerSample and friends, and we will read this - * too.
- */ - { - uint16 value; - enum TIFFReadDirEntryErr err; - err=TIFFReadDirEntryShort(tif,dp,&value); - if (err==TIFFReadDirEntryErrCount) - err=TIFFReadDirEntryPersampleShort(tif,dp,&value); - if (err!=TIFFReadDirEntryErrOk) + if (!dp->tdir_ignore) { + switch (dp->tdir_tag) + { + case TIFFTAG_MINSAMPLEVALUE: + case TIFFTAG_MAXSAMPLEVALUE: + case TIFFTAG_BITSPERSAMPLE: + case TIFFTAG_DATATYPE: + case TIFFTAG_SAMPLEFORMAT: + /* + * The MinSampleValue, MaxSampleValue, BitsPerSample + * DataType and SampleFormat tags are supposed to be + * written as one value/sample, but some vendors + * incorrectly write one value only -- so we accept + * that as well (yuck). Other vendors write correct + * value for NumberOfSamples, but incorrect one for + * BitsPerSample and friends, and we will read this + * too. + */ { - fip = TIFFFieldWithTag(tif,dp->tdir_tag); - TIFFReadDirEntryOutputErr(tif,err,module,fip ? fip->field_name : "unknown tagname",0); - goto bad; - } - if (!TIFFSetField(tif,dp->tdir_tag,value)) - goto bad; - if( dp->tdir_tag == TIFFTAG_BITSPERSAMPLE ) - bitspersample_read = TRUE; - } - break; - case TIFFTAG_SMINSAMPLEVALUE: - case TIFFTAG_SMAXSAMPLEVALUE: - { - - double *data = NULL; - enum TIFFReadDirEntryErr err; - uint32 saved_flags; - int m; - if (dp->tdir_count != (uint64)tif->tif_dir.td_samplesperpixel) - err = TIFFReadDirEntryErrCount; - else - err = TIFFReadDirEntryDoubleArray(tif, dp, &data); - if (err!=TIFFReadDirEntryErrOk) - { - fip = TIFFFieldWithTag(tif,dp->tdir_tag); - TIFFReadDirEntryOutputErr(tif,err,module,fip ? fip->field_name : "unknown tagname",0); - goto bad; - } - saved_flags = tif->tif_flags; - tif->tif_flags |= TIFF_PERSAMPLE; - m = TIFFSetField(tif,dp->tdir_tag,data); - tif->tif_flags = saved_flags; - _TIFFfree(data); - if (!m) - goto bad; - } - break; - case TIFFTAG_STRIPOFFSETS: - case TIFFTAG_TILEOFFSETS: -#if defined(DEFER_STRILE_LOAD) - _TIFFmemcpy( &(tif->tif_dir.td_stripoffset_entry), - dp, sizeof(TIFFDirEntry) ); -#else - if( tif->tif_dir.td_stripoffset != NULL ) - { - TIFFErrorExt(tif->tif_clientdata, module, - "tif->tif_dir.td_stripoffset is " - "already allocated. Likely duplicated " - "StripOffsets/TileOffsets tag"); - goto bad; - } - if (!TIFFFetchStripThing(tif,dp,tif->tif_dir.td_nstrips,&tif->tif_dir.td_stripoffset)) - goto bad; -#endif - break; - case TIFFTAG_STRIPBYTECOUNTS: - case TIFFTAG_TILEBYTECOUNTS: -#if defined(DEFER_STRILE_LOAD) - _TIFFmemcpy( &(tif->tif_dir.td_stripbytecount_entry), - dp, sizeof(TIFFDirEntry) ); -#else - if( tif->tif_dir.td_stripbytecount != NULL ) - { - TIFFErrorExt(tif->tif_clientdata, module, - "tif->tif_dir.td_stripbytecount is " - "already allocated. Likely duplicated " - "StripByteCounts/TileByteCounts tag"); - goto bad; - } - if (!TIFFFetchStripThing(tif,dp,tif->tif_dir.td_nstrips,&tif->tif_dir.td_stripbytecount)) - goto bad; -#endif - break; - case TIFFTAG_COLORMAP: - case TIFFTAG_TRANSFERFUNCTION: - { - enum TIFFReadDirEntryErr err; - uint32 countpersample; - uint32 countrequired; - uint32 incrementpersample; - uint16* value=NULL; - /* It would be dangerous to instantiate those tag values */ - /* since if td_bitspersample has not yet been read (due to */ - /* unordered tags), it could be read afterwards with a */ - /* values greater than the default one (1), which may cause */ - /* crashes in user code */ - if( !bitspersample_read ) - { - fip = TIFFFieldWithTag(tif,dp->tdir_tag); - TIFFWarningExt(tif->tif_clientdata,module, - "Ignoring %s since BitsPerSample tag not found", - fip ? 
fip->field_name : "unknown tagname"); - continue; - } - /* ColorMap or TransferFunction for high bit */ - /* depths do not make much sense and could be */ - /* used as a denial of service vector */ - if (tif->tif_dir.td_bitspersample > 24) - { - fip = TIFFFieldWithTag(tif,dp->tdir_tag); - TIFFWarningExt(tif->tif_clientdata,module, - "Ignoring %s because BitsPerSample=%d>24", - fip ? fip->field_name : "unknown tagname", - tif->tif_dir.td_bitspersample); - continue; - } - countpersample=(1U<tif_dir.td_bitspersample); - if ((dp->tdir_tag==TIFFTAG_TRANSFERFUNCTION)&&(dp->tdir_count==(uint64)countpersample)) - { - countrequired=countpersample; - incrementpersample=0; - } - else - { - countrequired=3*countpersample; - incrementpersample=countpersample; - } - if (dp->tdir_count!=(uint64)countrequired) - err=TIFFReadDirEntryErrCount; - else - err=TIFFReadDirEntryShortArray(tif,dp,&value); - if (err!=TIFFReadDirEntryErrOk) - { - fip = TIFFFieldWithTag(tif,dp->tdir_tag); - TIFFReadDirEntryOutputErr(tif,err,module,fip ? fip->field_name : "unknown tagname",1); - } - else - { - TIFFSetField(tif,dp->tdir_tag,value,value+incrementpersample,value+2*incrementpersample); - _TIFFfree(value); - } - } - break; -/* BEGIN REV 4.0 COMPATIBILITY */ - case TIFFTAG_OSUBFILETYPE: - { - uint16 valueo; - uint32 value; - if (TIFFReadDirEntryShort(tif,dp,&valueo)==TIFFReadDirEntryErrOk) - { - switch (valueo) + uint16 value; + enum TIFFReadDirEntryErr err; + err=TIFFReadDirEntryShort(tif,dp,&value); + if (err==TIFFReadDirEntryErrCount) + err=TIFFReadDirEntryPersampleShort(tif,dp,&value); + if (err!=TIFFReadDirEntryErrOk) { - case OFILETYPE_REDUCEDIMAGE: value=FILETYPE_REDUCEDIMAGE; break; - case OFILETYPE_PAGE: value=FILETYPE_PAGE; break; - default: value=0; break; + fip = TIFFFieldWithTag(tif,dp->tdir_tag); + TIFFReadDirEntryOutputErr(tif,err,module,fip ? fip->field_name : "unknown tagname",0); + goto bad; } - if (value!=0) - TIFFSetField(tif,TIFFTAG_SUBFILETYPE,value); + if (!TIFFSetField(tif,dp->tdir_tag,value)) + goto bad; + if( dp->tdir_tag == TIFFTAG_BITSPERSAMPLE ) + bitspersample_read = TRUE; } - } - break; + break; + case TIFFTAG_SMINSAMPLEVALUE: + case TIFFTAG_SMAXSAMPLEVALUE: + { + + double *data = NULL; + enum TIFFReadDirEntryErr err; + uint32 saved_flags; + int m; + if (dp->tdir_count != (uint64)tif->tif_dir.td_samplesperpixel) + err = TIFFReadDirEntryErrCount; + else + err = TIFFReadDirEntryDoubleArray(tif, dp, &data); + if (err!=TIFFReadDirEntryErrOk) + { + fip = TIFFFieldWithTag(tif,dp->tdir_tag); + TIFFReadDirEntryOutputErr(tif,err,module,fip ? fip->field_name : "unknown tagname",0); + goto bad; + } + saved_flags = tif->tif_flags; + tif->tif_flags |= TIFF_PERSAMPLE; + m = TIFFSetField(tif,dp->tdir_tag,data); + tif->tif_flags = saved_flags; + _TIFFfree(data); + if (!m) + goto bad; + } + break; + case TIFFTAG_STRIPOFFSETS: + case TIFFTAG_TILEOFFSETS: + switch( dp->tdir_type ) + { + case TIFF_SHORT: + case TIFF_LONG: + case TIFF_LONG8: + break; + default: + /* Warn except if directory typically created with TIFFDeferStrileArrayWriting() */ + if( !(tif->tif_mode == O_RDWR && + dp->tdir_count == 0 && + dp->tdir_type == 0 && + dp->tdir_offset.toff_long8 == 0) ) + { + fip = TIFFFieldWithTag(tif,dp->tdir_tag); + TIFFWarningExt(tif->tif_clientdata,module, + "Invalid data type for tag %s", + fip ? 
fip->field_name : "unknown tagname"); + } + break; + } + _TIFFmemcpy( &(tif->tif_dir.td_stripoffset_entry), + dp, sizeof(TIFFDirEntry) ); + break; + case TIFFTAG_STRIPBYTECOUNTS: + case TIFFTAG_TILEBYTECOUNTS: + switch( dp->tdir_type ) + { + case TIFF_SHORT: + case TIFF_LONG: + case TIFF_LONG8: + break; + default: + /* Warn except if directory typically created with TIFFDeferStrileArrayWriting() */ + if( !(tif->tif_mode == O_RDWR && + dp->tdir_count == 0 && + dp->tdir_type == 0 && + dp->tdir_offset.toff_long8 == 0) ) + { + fip = TIFFFieldWithTag(tif,dp->tdir_tag); + TIFFWarningExt(tif->tif_clientdata,module, + "Invalid data type for tag %s", + fip ? fip->field_name : "unknown tagname"); + } + break; + } + _TIFFmemcpy( &(tif->tif_dir.td_stripbytecount_entry), + dp, sizeof(TIFFDirEntry) ); + break; + case TIFFTAG_COLORMAP: + case TIFFTAG_TRANSFERFUNCTION: + { + enum TIFFReadDirEntryErr err; + uint32 countpersample; + uint32 countrequired; + uint32 incrementpersample; + uint16* value=NULL; + /* It would be dangerous to instantiate those tag values */ + /* since if td_bitspersample has not yet been read (due to */ + /* unordered tags), it could be read afterwards with a */ + /* values greater than the default one (1), which may cause */ + /* crashes in user code */ + if( !bitspersample_read ) + { + fip = TIFFFieldWithTag(tif,dp->tdir_tag); + TIFFWarningExt(tif->tif_clientdata,module, + "Ignoring %s since BitsPerSample tag not found", + fip ? fip->field_name : "unknown tagname"); + continue; + } + /* ColorMap or TransferFunction for high bit */ + /* depths do not make much sense and could be */ + /* used as a denial of service vector */ + if (tif->tif_dir.td_bitspersample > 24) + { + fip = TIFFFieldWithTag(tif,dp->tdir_tag); + TIFFWarningExt(tif->tif_clientdata,module, + "Ignoring %s because BitsPerSample=%d>24", + fip ? fip->field_name : "unknown tagname", + tif->tif_dir.td_bitspersample); + continue; + } + countpersample=(1U<tif_dir.td_bitspersample); + if ((dp->tdir_tag==TIFFTAG_TRANSFERFUNCTION)&&(dp->tdir_count==(uint64)countpersample)) + { + countrequired=countpersample; + incrementpersample=0; + } + else + { + countrequired=3*countpersample; + incrementpersample=countpersample; + } + if (dp->tdir_count!=(uint64)countrequired) + err=TIFFReadDirEntryErrCount; + else + err=TIFFReadDirEntryShortArray(tif,dp,&value); + if (err!=TIFFReadDirEntryErrOk) + { + fip = TIFFFieldWithTag(tif,dp->tdir_tag); + TIFFReadDirEntryOutputErr(tif,err,module,fip ? 
fip->field_name : "unknown tagname",1); + } + else + { + TIFFSetField(tif,dp->tdir_tag,value,value+incrementpersample,value+2*incrementpersample); + _TIFFfree(value); + } + } + break; +/* BEGIN REV 4.0 COMPATIBILITY */ + case TIFFTAG_OSUBFILETYPE: + { + uint16 valueo; + uint32 value; + if (TIFFReadDirEntryShort(tif,dp,&valueo)==TIFFReadDirEntryErrOk) + { + switch (valueo) + { + case OFILETYPE_REDUCEDIMAGE: value=FILETYPE_REDUCEDIMAGE; break; + case OFILETYPE_PAGE: value=FILETYPE_PAGE; break; + default: value=0; break; + } + if (value!=0) + TIFFSetField(tif,TIFFTAG_SUBFILETYPE,value); + } + } + break; /* END REV 4.0 COMPATIBILITY */ - default: - (void) TIFFFetchNormalTag(tif, dp, TRUE); - break; - } - } + default: + (void) TIFFFetchNormalTag(tif, dp, TRUE); + break; + } + } /* -- if (!dp->tdir_ignore) */ + } /* -- for-loop -- */ + + if( tif->tif_mode == O_RDWR && + tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 && + tif->tif_dir.td_stripoffset_entry.tdir_count == 0 && + tif->tif_dir.td_stripoffset_entry.tdir_type == 0 && + tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0 ) + { + /* Directory typically created with TIFFDeferStrileArrayWriting() */ + TIFFSetupStrips(tif); + } + else if( !(tif->tif_flags&TIFF_DEFERSTRILELOAD) ) + { + if( tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 ) + { + if (!TIFFFetchStripThing(tif,&(tif->tif_dir.td_stripoffset_entry), + tif->tif_dir.td_nstrips, + &tif->tif_dir.td_stripoffset_p)) + { + goto bad; + } + } + if( tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 ) + { + if (!TIFFFetchStripThing(tif,&(tif->tif_dir.td_stripbytecount_entry), + tif->tif_dir.td_nstrips, + &tif->tif_dir.td_stripbytecount_p)) + { + goto bad; + } + } + } + /* * OJPEG hack: * - If a) compression is OJPEG, and b) photometric tag is missing, @@ -4129,33 +4220,10 @@ TIFFReadDirectory(TIFF* tif) "\"StripByteCounts\" field, calculating from imagelength"); if (EstimateStripByteCounts(tif, dir, dircount) < 0) goto bad; - /* - * Assume we have wrong StripByteCount value (in case - * of single strip) in following cases: - * - it is equal to zero along with StripOffset; - * - it is larger than file itself (in case of uncompressed - * image); - * - it is smaller than the size of the bytes per row - * multiplied on the number of rows. The last case should - * not be checked in the case of writing new image, - * because we may do not know the exact strip size - * until the whole image will be written and directory - * dumped out. 
- */ - #define BYTECOUNTLOOKSBAD \ - ( (tif->tif_dir.td_stripbytecount[0] == 0 && tif->tif_dir.td_stripoffset[0] != 0) || \ - (tif->tif_dir.td_compression == COMPRESSION_NONE && \ - (tif->tif_dir.td_stripoffset[0] <= TIFFGetFileSize(tif) && \ - tif->tif_dir.td_stripbytecount[0] > TIFFGetFileSize(tif) - tif->tif_dir.td_stripoffset[0])) || \ - (tif->tif_mode == O_RDONLY && \ - tif->tif_dir.td_compression == COMPRESSION_NONE && \ - tif->tif_dir.td_stripbytecount[0] < TIFFScanlineSize64(tif) * tif->tif_dir.td_imagelength) ) } else if (tif->tif_dir.td_nstrips == 1 && !(tif->tif_flags&TIFF_ISTILED) - && _TIFFFillStriles(tif) - && tif->tif_dir.td_stripoffset[0] != 0 - && BYTECOUNTLOOKSBAD) { + && ByteCountLooksBad(tif)) { /* * XXX: Plexus (and others) sometimes give a value of * zero for a tag when they don't know what the @@ -4167,13 +4235,13 @@ TIFFReadDirectory(TIFF* tif) if(EstimateStripByteCounts(tif, dir, dircount) < 0) goto bad; -#if !defined(DEFER_STRILE_LOAD) - } else if (tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG + } else if (!(tif->tif_flags&TIFF_DEFERSTRILELOAD) + && tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG && tif->tif_dir.td_nstrips > 2 && tif->tif_dir.td_compression == COMPRESSION_NONE - && tif->tif_dir.td_stripbytecount[0] != tif->tif_dir.td_stripbytecount[1] - && tif->tif_dir.td_stripbytecount[0] != 0 - && tif->tif_dir.td_stripbytecount[1] != 0 ) { + && TIFFGetStrileByteCount(tif, 0) != TIFFGetStrileByteCount(tif, 1) + && TIFFGetStrileByteCount(tif, 0) != 0 + && TIFFGetStrileByteCount(tif, 1) != 0 ) { /* * XXX: Some vendors fill StripByteCount array with * absolutely wrong values (it can be equal to @@ -4188,7 +4256,6 @@ TIFFReadDirectory(TIFF* tif) "Wrong \"StripByteCounts\" field, ignoring and calculating from imagelength"); if (EstimateStripByteCounts(tif, dir, dircount) < 0) goto bad; -#endif /* !defined(DEFER_STRILE_LOAD) */ } } if (dir) @@ -4203,26 +4270,27 @@ TIFFReadDirectory(TIFF* tif) else tif->tif_dir.td_maxsamplevalue = (uint16)((1L<<tif->tif_dir.td_bitspersample)-1); } + +#ifdef STRIPBYTECOUNTSORTED_UNUSED /* * XXX: We can optimize checking for the strip bounds using the sorted * bytecounts array. See also comments for TIFFAppendToStrip() * function in tif_write.c. */ -#if !defined(DEFER_STRILE_LOAD) - if (tif->tif_dir.td_nstrips > 1) { + if (!(tif->tif_flags&TIFF_DEFERSTRILELOAD) && tif->tif_dir.td_nstrips > 1) { uint32 strip; tif->tif_dir.td_stripbytecountsorted = 1; for (strip = 1; strip < tif->tif_dir.td_nstrips; strip++) { - if (tif->tif_dir.td_stripoffset[strip - 1] > - tif->tif_dir.td_stripoffset[strip]) { + if (TIFFGetStrileOffset(tif, strip - 1) > + TIFFGetStrileOffset(tif, strip)) { tif->tif_dir.td_stripbytecountsorted = 0; break; } } } -#endif /* !defined(DEFER_STRILE_LOAD) */ - +#endif + /* * An opportunity for compression mode dependent tag fixup */ @@ -4241,13 +4309,11 @@ TIFFReadDirectory(TIFF* tif) (tif->tif_dir.td_nstrips==1)&& (tif->tif_dir.td_compression==COMPRESSION_NONE)&& ((tif->tif_flags&(TIFF_STRIPCHOP|TIFF_ISTILED))==TIFF_STRIPCHOP)) - { - if ( !_TIFFFillStriles(tif) || !tif->tif_dir.td_stripbytecount ) - return 0; - ChopUpSingleUncompressedStrip(tif); - } + { + ChopUpSingleUncompressedStrip(tif); + } - /* There are also uncompressed stripped files with strips larger than */ + /* There are also uncompressed striped files with strips larger than */ /* 2 GB, which make them unfriendly with a lot of code. If possible, */ /* try to expose smaller "virtual" strips.
*/ if( tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG && @@ -4255,8 +4321,6 @@ TIFFReadDirectory(TIFF* tif) (tif->tif_flags&(TIFF_STRIPCHOP|TIFF_ISTILED)) == TIFF_STRIPCHOP && TIFFStripSize64(tif) > 0x7FFFFFFFUL ) { - if ( !_TIFFFillStriles(tif) || !tif->tif_dir.td_stripbytecount ) - return 0; TryChopUpUncompressedBigTiff(tif); } @@ -4384,6 +4448,7 @@ TIFFReadCustomDirectory(TIFF* tif, toff_t diroff, uint16 di; const TIFFField* fip; uint32 fii; + (*tif->tif_cleanup)(tif); /* cleanup any previous compression state */ _TIFFSetupFields(tif, infoarray); dircount=TIFFFetchDirectory(tif,diroff,&dir,NULL); if (!dircount) @@ -4410,17 +4475,17 @@ TIFFReadCustomDirectory(TIFF* tif, toff_t diroff, TIFFWarningExt(tif->tif_clientdata, module, "Registering anonymous field with tag %d (0x%x) failed", dp->tdir_tag, dp->tdir_tag); - dp->tdir_tag=IGNORE; + dp->tdir_ignore = TRUE; } else { TIFFReadDirectoryFindFieldInfo(tif,dp->tdir_tag,&fii); assert( fii != FAILED_FII ); } } - if (dp->tdir_tag!=IGNORE) + if (!dp->tdir_ignore) { fip=tif->tif_fields[fii]; if (fip->field_bit==FIELD_IGNORE) - dp->tdir_tag=IGNORE; + dp->tdir_ignore = TRUE; else { /* check data type */ @@ -4440,7 +4505,7 @@ TIFFReadCustomDirectory(TIFF* tif, toff_t diroff, TIFFWarningExt(tif->tif_clientdata, module, "Wrong data type %d for \"%s\"; tag ignored", dp->tdir_type,fip->field_name); - dp->tdir_tag=IGNORE; + dp->tdir_ignore = TRUE; } else { @@ -4454,21 +4519,21 @@ TIFFReadCustomDirectory(TIFF* tif, toff_t diroff, else expected=(uint32)fip->field_readcount; if (!CheckDirCount(tif,dp,expected)) - dp->tdir_tag=IGNORE; + dp->tdir_ignore = TRUE; } } } - switch (dp->tdir_tag) - { - case IGNORE: - break; - case EXIFTAG_SUBJECTDISTANCE: - (void) TIFFFetchSubjectDistance(tif,dp); - break; - default: - (void) TIFFFetchNormalTag(tif, dp, TRUE); - break; - } + if (!dp->tdir_ignore) { + switch (dp->tdir_tag) + { + case EXIFTAG_SUBJECTDISTANCE: + (void)TIFFFetchSubjectDistance(tif, dp); + break; + default: + (void)TIFFFetchNormalTag(tif, dp, TRUE); + break; + } + } /*-- if (!dp->tdir_ignore) */ } } if (dir) @@ -4488,6 +4553,17 @@ TIFFReadEXIFDirectory(TIFF* tif, toff_t diroff) return TIFFReadCustomDirectory(tif, diroff, exifFieldArray); } +/* + *--: EXIF-GPS custom directory reading as another special case of custom IFD. 
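+ *
+ * Sketch of the intended call pattern (assumes the GPS IFD offset has
+ * already been fetched from the main directory's GPSIFD tag; error
+ * handling omitted):
+ *
+ *     toff_t gps_diroff = 0;
+ *     if (TIFFGetField(tif, TIFFTAG_GPSIFD, &gps_diroff))
+ *         TIFFReadGPSDirectory(tif, gps_diroff);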
+ */ +int +TIFFReadGPSDirectory(TIFF* tif, toff_t diroff) +{ + const TIFFFieldArray* gpsFieldArray; + gpsFieldArray = _TIFFGetGpsFields(); + return TIFFReadCustomDirectory(tif, diroff, gpsFieldArray); +} + static int EstimateStripByteCounts(TIFF* tif, TIFFDirEntry* dir, uint16 dircount) { @@ -4501,12 +4577,12 @@ EstimateStripByteCounts(TIFF* tif, TIFFDirEntry* dir, uint16 dircount) if( !_TIFFFillStrilesInternal( tif, 0 ) ) return -1; - if (td->td_stripbytecount) - _TIFFfree(td->td_stripbytecount); - td->td_stripbytecount = (uint64*) + if (td->td_stripbytecount_p) + _TIFFfree(td->td_stripbytecount_p); + td->td_stripbytecount_p = (uint64*) _TIFFCheckMalloc(tif, td->td_nstrips, sizeof (uint64), "for \"StripByteCounts\" array"); - if( td->td_stripbytecount == NULL ) + if( td->td_stripbytecount_p == NULL ) return -1; if (td->td_compression != COMPRESSION_NONE) { @@ -4530,6 +4606,8 @@ EstimateStripByteCounts(TIFF* tif, TIFFDirEntry* dir, uint16 dircount) dp->tdir_type); return -1; } + if( dp->tdir_count > TIFF_UINT64_MAX / typewidth ) + return -1; datasize=(uint64)typewidth*dp->tdir_count; if (!(tif->tif_flags&TIFF_BIGTIFF)) { @@ -4541,6 +4619,8 @@ EstimateStripByteCounts(TIFF* tif, TIFFDirEntry* dir, uint16 dircount) if (datasize<=8) datasize=0; } + if( space > TIFF_UINT64_MAX - datasize ) + return -1; space+=datasize; } if( filesize < space ) @@ -4551,7 +4631,7 @@ EstimateStripByteCounts(TIFF* tif, TIFFDirEntry* dir, uint16 dircount) if (td->td_planarconfig == PLANARCONFIG_SEPARATE) space /= td->td_samplesperpixel; for (strip = 0; strip < td->td_nstrips; strip++) - td->td_stripbytecount[strip] = space; + td->td_stripbytecount_p[strip] = space; /* * This gross hack handles the case where the offset to * the last strip is past the place where we think the strip @@ -4560,18 +4640,30 @@ EstimateStripByteCounts(TIFF* tif, TIFFDirEntry* dir, uint16 dircount) * of data in the strip and trim this number back accordingly. */ strip--; - if (td->td_stripoffset[strip]+td->td_stripbytecount[strip] > filesize) - td->td_stripbytecount[strip] = filesize - td->td_stripoffset[strip]; + if (td->td_stripoffset_p[strip] > TIFF_UINT64_MAX - td->td_stripbytecount_p[strip]) + return -1; + if (td->td_stripoffset_p[strip]+td->td_stripbytecount_p[strip] > filesize) { + if( td->td_stripoffset_p[strip] >= filesize ) { + /* Not sure what we should do in that case... 
*/ + td->td_stripbytecount_p[strip] = 0; + } else { + td->td_stripbytecount_p[strip] = filesize - td->td_stripoffset_p[strip]; + } + } } else if (isTiled(tif)) { uint64 bytespertile = TIFFTileSize64(tif); for (strip = 0; strip < td->td_nstrips; strip++) - td->td_stripbytecount[strip] = bytespertile; + td->td_stripbytecount_p[strip] = bytespertile; } else { uint64 rowbytes = TIFFScanlineSize64(tif); uint32 rowsperstrip = td->td_imagelength/td->td_stripsperimage; for (strip = 0; strip < td->td_nstrips; strip++) - td->td_stripbytecount[strip] = rowbytes * rowsperstrip; + { + if( rowbytes > 0 && rowsperstrip > TIFF_UINT64_MAX / rowbytes ) + return -1; + td->td_stripbytecount_p[strip] = rowbytes * rowsperstrip; + } } TIFFSetFieldBit(tif, FIELD_STRIPBYTECOUNTS); if (!TIFFFieldSet(tif, FIELD_ROWSPERSTRIP)) @@ -4765,12 +4857,13 @@ TIFFFetchDirectory(TIFF* tif, uint64 diroff, TIFFDirEntry** pdir, } } else { tmsize_t m; - tmsize_t off = (tmsize_t) tif->tif_diroff; - if ((uint64)off!=tif->tif_diroff) + tmsize_t off; + if (tif->tif_diroff > (uint64)TIFF_INT64_MAX) { TIFFErrorExt(tif->tif_clientdata,module,"Can not read TIFF directory count"); return(0); } + off = (tmsize_t) tif->tif_diroff; /* * Check for integer overflow when validating the dir_off, @@ -4888,6 +4981,7 @@ TIFFFetchDirectory(TIFF* tif, uint64 diroff, TIFFDirEntry** pdir, mb=dir; for (n=0; n<dircount; n++) { + mb->tdir_ignore = FALSE; if (tif->tif_flags&TIFF_SWAB) TIFFSwabShort((uint16*)ma); mb->tdir_tag=*(uint16*)ma; @@ -4902,6 +4996,7 @@ TIFFFetchDirectory(TIFF* tif, uint64 diroff, TIFFDirEntry** pdir, TIFFSwabLong((uint32*)ma); mb->tdir_count=(uint64)(*(uint32*)ma); ma+=sizeof(uint32); + mb->tdir_offset.toff_long8=0; *(uint32*)(&mb->tdir_offset)=*(uint32*)ma; ma+=sizeof(uint32); } @@ -5104,6 +5199,7 @@ TIFFFetchNormalTag(TIFF* tif, TIFFDirEntry* dp, int recover) if (err==TIFFReadDirEntryErrOk) { int m; + assert(data); /* avoid Clang static Analyzer false positive */ m=TIFFSetField(tif,dp->tdir_tag,data[0],data[1]); _TIFFfree(data); if (!m) @@ -5187,7 +5283,7 @@ TIFFFetchNormalTag(TIFF* tif, TIFFDirEntry* dp, int recover) assert(fip->field_readcount>=1); assert(fip->field_passcount==0); if (dp->tdir_count!=(uint64)fip->field_readcount) - /* corrupt file */; + /* corrupt file */; else { err=TIFFReadDirEntryFloatArray(tif,dp,&data); @@ -5203,6 +5299,29 @@ TIFFFetchNormalTag(TIFF* tif, TIFFDirEntry* dp, int recover) } } break; + /*--: Rational2Double: Extend for Double Arrays and Rational-Arrays read into Double-Arrays. */ + case TIFF_SETGET_C0_DOUBLE: + { + double* data; + assert(fip->field_readcount>=1); + assert(fip->field_passcount==0); + if (dp->tdir_count!=(uint64)fip->field_readcount) + /* corrupt file */; + else + { + err=TIFFReadDirEntryDoubleArray(tif,dp,&data); + if (err==TIFFReadDirEntryErrOk) + { + int m; + m=TIFFSetField(tif,dp->tdir_tag,data); + if (data!=0) + _TIFFfree(data); + if (!m) + return(0); + } + } + } + break; + case TIFF_SETGET_C16_ASCII: { uint8* data; @@ -5695,7 +5814,7 @@ TIFFFetchSubjectDistance(TIFF* tif, TIFFDirEntry* dir) TIFFSwabArrayOfLong(m.i,2); if (m.i[0]==0) n=0.0; - else if (m.i[0]==0xFFFFFFFF) + else if (m.i[0]==0xFFFFFFFF || m.i[1]==0) /* * XXX: Numerator 0xFFFFFFFF means that we have infinite * distance.
Indicate that with a negative floating point @@ -5719,10 +5838,22 @@ static void allocChoppedUpStripArrays(TIFF* tif, uint32 nstrips, TIFFDirectory *td = &tif->tif_dir; uint64 bytecount; uint64 offset; + uint64 last_offset; + uint64 last_bytecount; uint32 i; uint64 *newcounts; uint64 *newoffsets; + offset = TIFFGetStrileOffset(tif, 0); + last_offset = TIFFGetStrileOffset(tif, td->td_nstrips-1); + last_bytecount = TIFFGetStrileByteCount(tif, td->td_nstrips-1); + if( last_offset > TIFF_UINT64_MAX - last_bytecount || + last_offset + last_bytecount < offset ) + { + return; + } + bytecount = last_offset + last_bytecount - offset; + newcounts = (uint64*) _TIFFCheckMalloc(tif, nstrips, sizeof (uint64), "for chopped \"StripByteCounts\" array"); newoffsets = (uint64*) _TIFFCheckMalloc(tif, nstrips, sizeof (uint64), @@ -5743,9 +5874,6 @@ static void allocChoppedUpStripArrays(TIFF* tif, uint32 nstrips, * Fill the strip information arrays with new bytecounts and offsets * that reflect the broken-up format. */ - offset = td->td_stripoffset[0]; - bytecount = td->td_stripoffset[td->td_nstrips-1] + - td->td_stripbytecount[td->td_nstrips-1] - offset; for (i = 0; i < nstrips; i++) { if (stripbytes > bytecount) @@ -5762,11 +5890,14 @@ static void allocChoppedUpStripArrays(TIFF* tif, uint32 nstrips, td->td_stripsperimage = td->td_nstrips = nstrips; TIFFSetField(tif, TIFFTAG_ROWSPERSTRIP, rowsperstrip); - _TIFFfree(td->td_stripbytecount); - _TIFFfree(td->td_stripoffset); - td->td_stripbytecount = newcounts; - td->td_stripoffset = newoffsets; + _TIFFfree(td->td_stripbytecount_p); + _TIFFfree(td->td_stripoffset_p); + td->td_stripbytecount_p = newcounts; + td->td_stripoffset_p = newoffsets; +#ifdef STRIPBYTECOUNTSORTED_UNUSED td->td_stripbytecountsorted = 1; +#endif + tif->tif_flags |= TIFF_CHOPPEDUPARRAYS; } @@ -5788,13 +5919,13 @@ ChopUpSingleUncompressedStrip(TIFF* tif) uint32 nstrips; uint32 rowsperstrip; - bytecount = td->td_stripbytecount[0]; + bytecount = TIFFGetStrileByteCount(tif, 0); /* On a newly created file, just re-opened to be filled, we */ /* don't want strip chop to trigger as it is going to cause issues */ /* later ( StripOffsets and StripByteCounts improperly filled) . */ if( bytecount == 0 && tif->tif_mode != O_RDONLY ) return; - offset = td->td_stripoffset[0]; + offset = TIFFGetStrileOffset(tif, 0); assert(td->td_planarconfig == PLANARCONFIG_CONTIG); if ((td->td_photometric == PHOTOMETRIC_YCBCR)&& (!isUpSampled(tif))) @@ -5869,7 +6000,7 @@ static void TryChopUpUncompressedBigTiff( TIFF* tif ) /* On a newly created file, just re-opened to be filled, we */ /* don't want strip chop to trigger as it is going to cause issues */ /* later ( StripOffsets and StripByteCounts improperly filled) . 
*/ - if( td->td_stripbytecount[0] == 0 && tif->tif_mode != O_RDONLY ) + if( TIFFGetStrileByteCount(tif, 0) == 0 && tif->tif_mode != O_RDONLY ) return; if ((td->td_photometric == PHOTOMETRIC_YCBCR)&& @@ -5889,7 +6020,7 @@ static void TryChopUpUncompressedBigTiff( TIFF* tif ) { if( i == td->td_nstrips - 1 ) { - if( td->td_stripbytecount[i] < TIFFVStripSize64( + if( TIFFGetStrileByteCount(tif, i) < TIFFVStripSize64( tif, td->td_imagelength - i * td->td_rowsperstrip ) ) { return; @@ -5897,12 +6028,12 @@ static void TryChopUpUncompressedBigTiff( TIFF* tif ) } else { - if( td->td_stripbytecount[i] != stripsize ) + if( TIFFGetStrileByteCount(tif, i) != stripsize ) { return; } - if( i > 0 && td->td_stripoffset[i] != - td->td_stripoffset[i-1] + td->td_stripbytecount[i - 1] ) + if( i > 0 && TIFFGetStrileOffset(tif, i) != + TIFFGetStrileOffset(tif, i-1) + TIFFGetStrileByteCount(tif, i-1) ) { return; } @@ -5924,18 +6055,367 @@ static void TryChopUpUncompressedBigTiff( TIFF* tif ) /* If we are going to allocate a lot of memory, make sure that the */ /* file is as big as needed */ if( tif->tif_mode == O_RDONLY && - nstrips > 1000000 && - (td->td_stripoffset[td->td_nstrips-1] > TIFFGetFileSize(tif) || - td->td_stripoffset[td->td_nstrips-1] + - td->td_stripbytecount[td->td_nstrips-1] > TIFFGetFileSize(tif)) ) + nstrips > 1000000 ) { - return; + uint64 last_offset = TIFFGetStrileOffset(tif, td->td_nstrips-1); + uint64 filesize = TIFFGetFileSize(tif); + uint64 last_bytecount = TIFFGetStrileByteCount(tif, td->td_nstrips-1); + if( last_offset > filesize || + last_bytecount > filesize - last_offset ) + { + return; + } } allocChoppedUpStripArrays(tif, nstrips, stripbytes, rowsperstrip); } +TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW +static uint64 _TIFFUnsanitizedAddUInt64AndInt(uint64 a, int b) +{ + return a + b; +} + +/* Read the value of [Strip|Tile]Offset or [Strip|Tile]ByteCount around + * strip/tile of number strile. Also fetch the neighbouring values using a + * 4096 byte page size. 
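+ *
+ * Worked example (illustrative numbers): for a TIFF_LONG (4-byte) array
+ * whose data starts at file offset 10000, strile 30 sits at
+ * 10000 + 4*30 = 10120, so the page-aligned window [8192, 12288) is read
+ * and every array entry falling inside it is decoded, which makes the
+ * neighbouring striles available at no extra I/O cost.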
+ */ +static +int _TIFFPartialReadStripArray( TIFF* tif, TIFFDirEntry* dirent, + int strile, uint64* panVals ) +{ + static const char module[] = "_TIFFPartialReadStripArray"; +#define IO_CACHE_PAGE_SIZE 4096 + + size_t sizeofval; + const int bSwab = (tif->tif_flags & TIFF_SWAB) != 0; + int sizeofvalint; + uint64 nBaseOffset; + uint64 nOffset; + uint64 nOffsetStartPage; + uint64 nOffsetEndPage; + tmsize_t nToRead; + tmsize_t nRead; + uint64 nLastStripOffset; + int iStartBefore; + int i; + const uint32 arraySize = tif->tif_dir.td_stripoffsetbyteallocsize; + unsigned char buffer[2 * IO_CACHE_PAGE_SIZE]; + + assert( dirent->tdir_count > 4 ); + + if( dirent->tdir_type == TIFF_SHORT ) + { + sizeofval = sizeof(uint16); + } + else if( dirent->tdir_type == TIFF_LONG ) + { + sizeofval = sizeof(uint32); + } + else if( dirent->tdir_type == TIFF_LONG8 ) + { + sizeofval = sizeof(uint64); + } + else if( dirent->tdir_type == TIFF_SLONG8 ) + { + /* Non conformant but used by some images as in */ + /* https://github.com/OSGeo/gdal/issues/2165 */ + sizeofval = sizeof(int64); + } + else + { + TIFFErrorExt(tif->tif_clientdata, module, + "Invalid type for [Strip|Tile][Offset/ByteCount] tag"); + panVals[strile] = 0; + return 0; + } + sizeofvalint = (int)(sizeofval); + + if( tif->tif_flags&TIFF_BIGTIFF ) + { + uint64 offset = dirent->tdir_offset.toff_long8; + if( bSwab ) + TIFFSwabLong8(&offset); + nBaseOffset = offset; + } + else + { + uint32 offset = dirent->tdir_offset.toff_long; + if( bSwab ) + TIFFSwabLong(&offset); + nBaseOffset = offset; + } + /* To avoid later unsigned integer overflows */ + if( nBaseOffset > (uint64)TIFF_INT64_MAX ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Cannot read offset/size for strile %d", strile); + panVals[strile] = 0; + return 0; + } + nOffset = nBaseOffset + sizeofval * strile; + nOffsetStartPage = + (nOffset / IO_CACHE_PAGE_SIZE) * IO_CACHE_PAGE_SIZE; + nOffsetEndPage = nOffsetStartPage + IO_CACHE_PAGE_SIZE; + + if( nOffset + sizeofval > nOffsetEndPage ) + nOffsetEndPage += IO_CACHE_PAGE_SIZE; +#undef IO_CACHE_PAGE_SIZE + + nLastStripOffset = nBaseOffset + arraySize * sizeofval; + if( nLastStripOffset < nOffsetEndPage ) + nOffsetEndPage = nLastStripOffset; + if( nOffsetStartPage >= nOffsetEndPage ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Cannot read offset/size for strile %d", strile); + panVals[strile] = 0; + return 0; + } + if (!SeekOK(tif,nOffsetStartPage)) + { + panVals[strile] = 0; + return 0; + } + + nToRead = (tmsize_t)(nOffsetEndPage - nOffsetStartPage); + nRead = TIFFReadFile(tif, buffer, nToRead); + if( nRead < nToRead ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Cannot read offset/size for strile around ~%d", strile); + return 0; + } + iStartBefore = -(int)((nOffset - nOffsetStartPage) / sizeofval); + if( strile + iStartBefore < 0 ) + iStartBefore = -strile; + for( i = iStartBefore; + (uint32)(strile + i) < arraySize && + _TIFFUnsanitizedAddUInt64AndInt(nOffset, (i + 1) * sizeofvalint) <= nOffsetEndPage; + ++i ) + { + if( dirent->tdir_type == TIFF_SHORT ) + { + uint16 val; + memcpy(&val, + buffer + (nOffset - nOffsetStartPage) + i * sizeofvalint, + sizeof(val)); + if( bSwab ) + TIFFSwabShort(&val); + panVals[strile + i] = val; + } + else if( dirent->tdir_type == TIFF_LONG ) + { + uint32 val; + memcpy(&val, + buffer + (nOffset - nOffsetStartPage) + i * sizeofvalint, + sizeof(val)); + if( bSwab ) + TIFFSwabLong(&val); + panVals[strile + i] = val; + } + else if( dirent->tdir_type == TIFF_LONG8 ) + { + uint64 val; + memcpy(&val, + buffer + 
(nOffset - nOffsetStartPage) + i * sizeofvalint, + sizeof(val)); + if( bSwab ) + TIFFSwabLong8(&val); + panVals[strile + i] = val; + } + else /* if( dirent->tdir_type == TIFF_SLONG8 ) */ + { + /* Non conformant data type */ + int64 val; + memcpy(&val, + buffer + (nOffset - nOffsetStartPage) + i * sizeofvalint, + sizeof(val)); + if( bSwab ) + TIFFSwabLong8((uint64*) &val); + panVals[strile + i] = (uint64) val; + } + } + return 1; +} + +static int _TIFFFetchStrileValue(TIFF* tif, + uint32 strile, + TIFFDirEntry* dirent, + uint64** parray) +{ + static const char module[] = "_TIFFFetchStrileValue"; + TIFFDirectory *td = &tif->tif_dir; + if( strile >= dirent->tdir_count ) + { + return 0; + } + if( strile >= td->td_stripoffsetbyteallocsize ) + { + uint32 nStripArrayAllocBefore = td->td_stripoffsetbyteallocsize; + uint32 nStripArrayAllocNew; + uint64 nArraySize64; + size_t nArraySize; + uint64* offsetArray; + uint64* bytecountArray; + + if( strile > 1000000 ) + { + uint64 filesize = TIFFGetFileSize(tif); + /* Avoid excessive memory allocation attempt */ + /* For such a big blockid we need at least a TIFF_LONG per strile */ + /* for the offset array. */ + if( strile > filesize / sizeof(uint32) ) + { + TIFFErrorExt(tif->tif_clientdata, module, "File too short"); + return 0; + } + } + + if( td->td_stripoffsetbyteallocsize == 0 && + td->td_nstrips < 1024 * 1024 ) + { + nStripArrayAllocNew = td->td_nstrips; + } + else + { +#define TIFF_MAX(a,b) (((a)>(b)) ? (a) : (b)) +#define TIFF_MIN(a,b) (((a)<(b)) ? (a) : (b)) + nStripArrayAllocNew = TIFF_MAX(strile + 1, 1024U * 512U ); + if( nStripArrayAllocNew < 0xFFFFFFFFU / 2 ) + nStripArrayAllocNew *= 2; + nStripArrayAllocNew = TIFF_MIN(nStripArrayAllocNew, td->td_nstrips); + } + assert( strile < nStripArrayAllocNew ); + nArraySize64 = (uint64)sizeof(uint64) * nStripArrayAllocNew; + nArraySize = (size_t)(nArraySize64); +#if SIZEOF_SIZE_T == 4 + if( nArraySize != nArraySize64 ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Cannot allocate strip offset and bytecount arrays"); + return 0; + } +#endif + offsetArray = (uint64*)( + _TIFFrealloc( td->td_stripoffset_p, nArraySize ) ); + bytecountArray = (uint64*)( + _TIFFrealloc( td->td_stripbytecount_p, nArraySize ) ); + if( offsetArray ) + td->td_stripoffset_p = offsetArray; + if( bytecountArray ) + td->td_stripbytecount_p = bytecountArray; + if( offsetArray && bytecountArray ) + { + td->td_stripoffsetbyteallocsize = nStripArrayAllocNew; + /* Initialize new entries to ~0 / -1 */ + memset(td->td_stripoffset_p + nStripArrayAllocBefore, + 0xFF, + (td->td_stripoffsetbyteallocsize - nStripArrayAllocBefore) * sizeof(uint64) ); + memset(td->td_stripbytecount_p + nStripArrayAllocBefore, + 0xFF, + (td->td_stripoffsetbyteallocsize - nStripArrayAllocBefore) * sizeof(uint64) ); + } + else + { + TIFFErrorExt(tif->tif_clientdata, module, + "Cannot allocate strip offset and bytecount arrays"); + _TIFFfree(td->td_stripoffset_p); + td->td_stripoffset_p = NULL; + _TIFFfree(td->td_stripbytecount_p); + td->td_stripbytecount_p = NULL; + td->td_stripoffsetbyteallocsize = 0; + } + } + if( *parray == NULL || strile >= td->td_stripoffsetbyteallocsize ) + return 0; + + if( ~((*parray)[strile]) == 0 ) + { + if( !_TIFFPartialReadStripArray( tif, dirent, strile, *parray ) ) + { + (*parray)[strile] = 0; + return 0; + } + } + + return 1; +} + +static uint64 _TIFFGetStrileOffsetOrByteCountValue(TIFF *tif, uint32 strile, + TIFFDirEntry* dirent, + uint64** parray, + int *pbErr) +{ + TIFFDirectory *td = &tif->tif_dir; + if( pbErr ) + *pbErr = 
0; + if( (tif->tif_flags&TIFF_DEFERSTRILELOAD) && !(tif->tif_flags&TIFF_CHOPPEDUPARRAYS) ) + { + if( !(tif->tif_flags&TIFF_LAZYSTRILELOAD) || + /* If the values may fit in the toff_long/toff_long8 member */ + /* then use _TIFFFillStriles to simplify _TIFFFetchStrileValue */ + dirent->tdir_count <= 4 ) + { + if( !_TIFFFillStriles(tif) ) + { + if( pbErr ) + *pbErr = 1; + /* Do not return, as we want this function to always */ + /* return the same value if called several times with */ + /* the same arguments */ + } + } + else + { + if( !_TIFFFetchStrileValue(tif, strile, dirent, parray) ) + { + if( pbErr ) + *pbErr = 1; + return 0; + } + } + } + if( *parray == NULL || strile >= td->td_nstrips ) + { + if( pbErr ) + *pbErr = 1; + return 0; + } + return (*parray)[strile]; +} + +/* Return the value of the TileOffsets/StripOffsets array for the specified tile/strile */ +uint64 TIFFGetStrileOffset(TIFF *tif, uint32 strile) +{ + return TIFFGetStrileOffsetWithErr(tif, strile, NULL); +} + +/* Return the value of the TileOffsets/StripOffsets array for the specified tile/strile */ +uint64 TIFFGetStrileOffsetWithErr(TIFF *tif, uint32 strile, int *pbErr) +{ + TIFFDirectory *td = &tif->tif_dir; + return _TIFFGetStrileOffsetOrByteCountValue(tif, strile, + &(td->td_stripoffset_entry), + &(td->td_stripoffset_p), pbErr); +} + +/* Return the value of the TileByteCounts/StripByteCounts array for the specified tile/strile */ +uint64 TIFFGetStrileByteCount(TIFF *tif, uint32 strile) +{ + return TIFFGetStrileByteCountWithErr(tif, strile, NULL); +} + +/* Return the value of the TileByteCounts/StripByteCounts array for the specified tile/strile */ +uint64 TIFFGetStrileByteCountWithErr(TIFF *tif, uint32 strile, int *pbErr) +{ + TIFFDirectory *td = &tif->tif_dir; + return _TIFFGetStrileOffsetOrByteCountValue(tif, strile, + &(td->td_stripbytecount_entry), + &(td->td_stripbytecount_p), pbErr); +} + int _TIFFFillStriles( TIFF *tif ) { @@ -5944,51 +6424,64 @@ int _TIFFFillStriles( TIFF *tif ) static int _TIFFFillStrilesInternal( TIFF *tif, int loadStripByteCount ) { -#if defined(DEFER_STRILE_LOAD) - register TIFFDirectory *td = &tif->tif_dir; - int return_value = 1; + register TIFFDirectory *td = &tif->tif_dir; + int return_value = 1; - if( td->td_stripoffset != NULL ) - return 1; - - if( td->td_stripoffset_entry.tdir_count == 0 ) - return 0; - - if (!TIFFFetchStripThing(tif,&(td->td_stripoffset_entry), - td->td_nstrips,&td->td_stripoffset)) - { - return_value = 0; - } - - if (loadStripByteCount && - !TIFFFetchStripThing(tif,&(td->td_stripbytecount_entry), - td->td_nstrips,&td->td_stripbytecount)) - { - return_value = 0; - } - - _TIFFmemset( &(td->td_stripoffset_entry), 0, sizeof(TIFFDirEntry)); - _TIFFmemset( &(td->td_stripbytecount_entry), 0, sizeof(TIFFDirEntry)); - - if (tif->tif_dir.td_nstrips > 1 && return_value == 1 ) { - uint32 strip; - - tif->tif_dir.td_stripbytecountsorted = 1; - for (strip = 1; strip < tif->tif_dir.td_nstrips; strip++) { - if (tif->tif_dir.td_stripoffset[strip - 1] > - tif->tif_dir.td_stripoffset[strip]) { - tif->tif_dir.td_stripbytecountsorted = 0; - break; - } - } - } - - return return_value; -#else /* !defined(DEFER_STRILE_LOAD) */ - (void) tif; - (void) loadStripByteCount; + /* Do not do anything if TIFF_DEFERSTRILELOAD is not set */ + if( !(tif->tif_flags&TIFF_DEFERSTRILELOAD) || (tif->tif_flags&TIFF_CHOPPEDUPARRAYS) != 0 ) return 1; -#endif + + if( tif->tif_flags&TIFF_LAZYSTRILELOAD ) + { + /* In case of lazy loading, reload completely the arrays */ + _TIFFfree(td->td_stripoffset_p); + 
_TIFFfree(td->td_stripbytecount_p); + td->td_stripoffset_p = NULL; + td->td_stripbytecount_p = NULL; + td->td_stripoffsetbyteallocsize = 0; + tif->tif_flags &= ~TIFF_LAZYSTRILELOAD; + } + + /* If stripoffset array is already loaded, exit with success */ + if( td->td_stripoffset_p != NULL ) + return 1; + + /* If tdir_count was canceled, then we already got there, but in error */ + if( td->td_stripoffset_entry.tdir_count == 0 ) + return 0; + + if (!TIFFFetchStripThing(tif,&(td->td_stripoffset_entry), + td->td_nstrips,&td->td_stripoffset_p)) + { + return_value = 0; + } + + if (loadStripByteCount && + !TIFFFetchStripThing(tif,&(td->td_stripbytecount_entry), + td->td_nstrips,&td->td_stripbytecount_p)) + { + return_value = 0; + } + + _TIFFmemset( &(td->td_stripoffset_entry), 0, sizeof(TIFFDirEntry)); + _TIFFmemset( &(td->td_stripbytecount_entry), 0, sizeof(TIFFDirEntry)); + +#ifdef STRIPBYTECOUNTSORTED_UNUSED + if (tif->tif_dir.td_nstrips > 1 && return_value == 1 ) { + uint32 strip; + + tif->tif_dir.td_stripbytecountsorted = 1; + for (strip = 1; strip < tif->tif_dir.td_nstrips; strip++) { + if (tif->tif_dir.td_stripoffset_p[strip - 1] > + tif->tif_dir.td_stripoffset_p[strip]) { + tif->tif_dir.td_stripbytecountsorted = 0; + break; + } + } + } +#endif + + return return_value; } diff --git a/3rdparty/libtiff/tif_dirwrite.c b/3rdparty/libtiff/tif_dirwrite.c index 83c01b24f2..f481250e3b 100644 --- a/3rdparty/libtiff/tif_dirwrite.c +++ b/3rdparty/libtiff/tif_dirwrite.c @@ -28,6 +28,8 @@ * Directory Write Support Routines. */ #include "tiffiop.h" +#include <float.h> /*--: for Rational2Double */ +#include <math.h> /*--: for Rational2Double */ #ifdef HAVE_IEEEFP #define TIFFCvtNativeToIEEEFloat(tif, n, fp) @@ -154,6 +156,19 @@ static int TIFFWriteDirectoryTagCheckedSlong8Array(TIFF* tif, uint32* ndir, TIFF static int TIFFWriteDirectoryTagCheckedRational(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value); static int TIFFWriteDirectoryTagCheckedRationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value); static int TIFFWriteDirectoryTagCheckedSrationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value); + +/*--: Rational2Double: New functions to support true double-precision for custom rational tag types.
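+ (A sketch with assumed example values: DoubleToRational() can map 0.34375 exactly to num=11/denom=32, while values with no exact 32-bit rational representation are approximated.)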
*/ +static int TIFFWriteDirectoryTagRationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value); +static int TIFFWriteDirectoryTagSrationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value); +static int TIFFWriteDirectoryTagCheckedRationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value); +static int TIFFWriteDirectoryTagCheckedSrationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value); +static void DoubleToRational(double value, uint32 *num, uint32 *denom); +static void DoubleToSrational(double value, int32 *num, int32 *denom); +#if 0 +static void DoubleToRational_direct(double value, unsigned long *num, unsigned long *denom); +static void DoubleToSrational_direct(double value, long *num, long *denom); +#endif + #ifdef notdef static int TIFFWriteDirectoryTagCheckedFloat(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, float value); #endif @@ -181,6 +196,51 @@ TIFFWriteDirectory(TIFF* tif) return TIFFWriteDirectorySec(tif,TRUE,TRUE,NULL); } +/* + * This is an advanced writing function that must be used in a particular + * sequence, and generally together with TIFFForceStrileArrayWriting(), + * to make its intended effect. Its aim is to modify the location + * where the [Strip/Tile][Offsets/ByteCounts] arrays are located in the file. + * More precisely, when TIFFWriteCheck() will be called, the tag entries for + * those arrays will be written with type = count = offset = 0 as a temporary + * value. + * + * Its effect is only valid for the current directory, and before + * TIFFWriteDirectory() is first called, and will be reset when + * changing directory. + * + * The typical sequence of calls is: + * TIFFOpen() + * [ TIFFCreateDirectory(tif) ] + * Set fields with calls to TIFFSetField(tif, ...) + * TIFFDeferStrileArrayWriting(tif) + * TIFFWriteCheck(tif, ...) + * TIFFWriteDirectory(tif) + * ... potentially create other directories and come back to the above directory + * TIFFForceStrileArrayWriting(tif): emit the arrays at the end of file + * + * Returns 1 in case of success, 0 otherwise. + */ +int TIFFDeferStrileArrayWriting(TIFF* tif) +{ + static const char module[] = "TIFFDeferStrileArrayWriting"; + if (tif->tif_mode == O_RDONLY) + { + TIFFErrorExt(tif->tif_clientdata, tif->tif_name, + "File opened in read-only mode"); + return 0; + } + if( tif->tif_diroff != 0 ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Directory has already been written"); + return 0; + } + + tif->tif_dir.td_deferstrilearraywriting = TRUE; + return 1; +} + /* * Similar to TIFFWriteDirectory(), writes the directory out * but leaves all data structures in memory so that it can be @@ -192,7 +252,7 @@ TIFFCheckpointDirectory(TIFF* tif) { int rc; /* Setup the strips arrays, if they haven't already been. 
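(TIFFSetupStrips() is assumed to rebuild td_stripoffset_p and td_stripbytecount_p from the current directory fields, so the checkpoint below always has strile arrays to write out.)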
 /* * Similar to TIFFWriteDirectory(), writes the directory out * but leaves all data structures in memory so that it can be * @@ -192,7 +252,7 @@ TIFFCheckpointDirectory(TIFF* tif) { int rc; /* Setup the strips arrays, if they haven't already been. */ - if (tif->tif_dir.td_stripoffset == NULL) + if (tif->tif_dir.td_stripoffset_p == NULL) (void) TIFFSetupStrips(tif); rc = TIFFWriteDirectorySec(tif,TRUE,FALSE,NULL); (void) TIFFSetWriteOffset(tif, TIFFSeekFile(tif, 0, SEEK_END)); @@ -527,12 +587,12 @@ TIFFWriteDirectorySec(TIFF* tif, int isimage, int imagedone, uint64* pdiroff) { if (!isTiled(tif)) { - if (!TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_STRIPBYTECOUNTS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripbytecount)) + if (!TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_STRIPBYTECOUNTS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripbytecount_p)) goto bad; } else { - if (!TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_TILEBYTECOUNTS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripbytecount)) + if (!TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_TILEBYTECOUNTS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripbytecount_p)) goto bad; } } @@ -540,7 +600,7 @@ TIFFWriteDirectorySec(TIFF* tif, int isimage, int imagedone, uint64* pdiroff) { if (!isTiled(tif)) { - /* td_stripoffset might be NULL in an odd OJPEG case. See + /* td_stripoffset_p might be NULL in an odd OJPEG case. See * tif_dirread.c around line 3634. * XXX: OJPEG hack. * If a) compression is OJPEG, b) it's not a tiled TIFF, @@ -551,13 +611,13 @@ TIFFWriteDirectorySec(TIFF* tif, int isimage, int imagedone, uint64* pdiroff) * We can get here when using tiffset on such a file. * See http://bugzilla.maptools.org/show_bug.cgi?id=2500 */ - if (tif->tif_dir.td_stripoffset != NULL && - !TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_STRIPOFFSETS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripoffset)) + if (tif->tif_dir.td_stripoffset_p != NULL && - !TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_STRIPOFFSETS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripoffset_p)) goto bad; } else { - if (!TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_TILEOFFSETS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripoffset)) + if (!TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_TILEOFFSETS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripoffset_p)) goto bad; } } @@ -751,12 +811,42 @@ TIFFWriteDirectorySec(TIFF* tif, int isimage, int imagedone, uint64* pdiroff) goto bad; break; case TIFF_RATIONAL: - if (!TIFFWriteDirectoryTagRationalArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value)) - goto bad; + { + /*-- Rational2Double: For Rationals evaluate "set_field_type" to determine internal storage size. */ + int tv_size; + tv_size = _TIFFSetGetFieldSize(tif->tif_dir.td_customValues[m].info->set_field_type); + if (tv_size == 8) { + if (!TIFFWriteDirectoryTagRationalDoubleArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value)) + goto bad; + } else { + /*-- default should be tv_size == 4 */ + if (!TIFFWriteDirectoryTagRationalArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value)) + goto bad; + /*-- ToDo: After Testing, this should be removed and tv_size==4 should be set as default. */ + if (tv_size != 4) { + TIFFErrorExt(0,"TIFFLib: _TIFFWriteDirectorySec()", "Rational2Double: .set_field_type is not 4 but %d", tv_size); + } + } + } break; case TIFF_SRATIONAL: - if (!TIFFWriteDirectoryTagSrationalArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value)) - goto bad; + { + /*-- Rational2Double: For Rationals evaluate "set_field_type" to determine internal storage size.
*/ + int tv_size; + tv_size = _TIFFSetGetFieldSize(tif->tif_dir.td_customValues[m].info->set_field_type); + if (tv_size == 8) { + if (!TIFFWriteDirectoryTagSrationalDoubleArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value)) + goto bad; + } else { + /*-- default should be tv_size == 4 */ + if (!TIFFWriteDirectoryTagSrationalArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value)) + goto bad; + /*-- ToDo: After Testing, this should be removed and tv_size==4 should be set as default. */ + if (tv_size != 4) { + TIFFErrorExt(0,"TIFFLib: _TIFFWriteDirectorySec()", "Rational2Double: .set_field_type is not 4 but %d", tv_size); + } + } + } break; case TIFF_FLOAT: if (!TIFFWriteDirectoryTagFloatArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value)) goto bad; @@ -1515,6 +1605,29 @@ TIFFWriteDirectoryTagSrationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, return(TIFFWriteDirectoryTagCheckedSrationalArray(tif,ndir,dir,tag,count,value)); } +/*-- Rational2Double: additional write functions */ +static int +TIFFWriteDirectoryTagRationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value) +{ + if (dir==NULL) + { + (*ndir)++; + return(1); + } + return(TIFFWriteDirectoryTagCheckedRationalDoubleArray(tif,ndir,dir,tag,count,value)); +} + +static int +TIFFWriteDirectoryTagSrationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value) +{ + if (dir==NULL) + { + (*ndir)++; + return(1); + } + return(TIFFWriteDirectoryTagCheckedSrationalDoubleArray(tif,ndir,dir,tag,count,value)); +} + #ifdef notdef static int TIFFWriteDirectoryTagFloat(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, float value) { @@ -1651,22 +1764,52 @@ TIFFWriteDirectoryTagShortLong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint1 return(TIFFWriteDirectoryTagCheckedLong(tif,ndir,dir,tag,value)); } +static int _WriteAsType(TIFF* tif, uint64 strile_size, uint64 uncompressed_threshold) +{ + const uint16 compression = tif->tif_dir.td_compression; + if ( compression == COMPRESSION_NONE ) + { + return strile_size > uncompressed_threshold; + } + else if ( compression == COMPRESSION_JPEG || + compression == COMPRESSION_LZW || + compression == COMPRESSION_ADOBE_DEFLATE || + compression == COMPRESSION_LZMA || + compression == COMPRESSION_LERC || + compression == COMPRESSION_ZSTD || + compression == COMPRESSION_WEBP ) + { + /* For a few select compression types, we assume that in the worst */ + /* case the compressed size will be 10 times the uncompressed size */ + /* This is overly pessimistic! */ + return strile_size >= uncompressed_threshold / 10; + } + return 1; +} + +static int WriteAsLong8(TIFF* tif, uint64 strile_size) +{ + return _WriteAsType(tif, strile_size, 0xFFFFFFFFU); +} + +static int WriteAsLong4(TIFF* tif, uint64 strile_size) +{ + return _WriteAsType(tif, strile_size, 0xFFFFU); +} +
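/* Editor's note: a self-contained sketch (not part of the patch) of the
 * heuristic above, with the same thresholds: 0xFFFFFFFF decides LONG8 vs
 * LONG, 0xFFFF decides LONG vs SHORT, and for the listed codecs a
 * pessimistic 10x worst-case expansion of the strile size is assumed. */
#include <stdint.h>
#include <stdio.h>
static int write_as_type_sketch(uint64_t strile_size, int compressed, uint64_t threshold)
{
    if (!compressed)
        return strile_size > threshold;   /* COMPRESSION_NONE: exact check */
    return strile_size >= threshold / 10; /* assume 10x worst-case expansion */
}
int main(void)
{
    /* A 1 MiB deflate strip: too big for SHORT byte counts, fine for LONG. */
    printf("LONG8 needed: %d\n", write_as_type_sketch(1 << 20, 1, 0xFFFFFFFFU)); /* 0 */
    printf("LONG needed:  %d\n", write_as_type_sketch(1 << 20, 1, 0xFFFFU));     /* 1 */
    return 0;
}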
 /************************************************************************/ /* TIFFWriteDirectoryTagLongLong8Array() */ /* */ -/* Write out LONG8 array as LONG8 for BigTIFF or LONG for */ -/* Classic TIFF with some checking. */ +/* Write out LONG8 array and write a SHORT/LONG/LONG8 depending */ +/* on strile size and Classic/BigTIFF mode. */ /************************************************************************/ static int TIFFWriteDirectoryTagLongLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value) { static const char module[] = "TIFFWriteDirectoryTagLongLong8Array"; - uint64* ma; - uint32 mb; - uint32* p; - uint32* q; int o; + int write_aslong4; /* is this just a counting pass? */ if (dir==NULL) @@ -1675,37 +1818,105 @@ TIFFWriteDirectoryTagLongLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, return(1); } - /* We always write LONG8 for BigTIFF, no checking needed. */ - if( tif->tif_flags&TIFF_BIGTIFF ) - return TIFFWriteDirectoryTagCheckedLong8Array(tif,ndir,dir, - tag,count,value); - - /* - ** For classic tiff we want to verify everything is in range for LONG - ** and convert to long format. - */ - - p = _TIFFmalloc(count*sizeof(uint32)); - if (p==NULL) + if( tif->tif_dir.td_deferstrilearraywriting ) { - TIFFErrorExt(tif->tif_clientdata,module,"Out of memory"); - return(0); + return TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_NOTYPE, 0, 0, NULL); } - for (q=p, ma=value, mb=0; mb<count; ma++, mb++, q++) + if( tif->tif_flags&TIFF_BIGTIFF ) { - if (*ma>0xFFFFFFFF) + int write_aslong8 = 1; + /* In the case of ByteCounts array, we may be able to write them on */ + /* LONG if the strip/tilesize is not too big. */ + /* Also do that for count > 1 in the case someone would want to create */ + /* a single-strip file with a growing height, in which case using */ + /* LONG8 will be safer. */ + if( count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS ) { - TIFFErrorExt(tif->tif_clientdata,module, - "Attempt to write value larger than 0xFFFFFFFF in Classic TIFF file."); - _TIFFfree(p); + write_aslong8 = WriteAsLong8(tif, TIFFStripSize64(tif)); + } + else if( count > 1 && tag == TIFFTAG_TILEBYTECOUNTS ) + { + write_aslong8 = WriteAsLong8(tif, TIFFTileSize64(tif)); + } + if( write_aslong8 ) + { + return TIFFWriteDirectoryTagCheckedLong8Array(tif,ndir,dir, + tag,count,value); + } + } + + write_aslong4 = 1; + if( count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS ) + { + write_aslong4 = WriteAsLong4(tif, TIFFStripSize64(tif)); + } + else if( count > 1 && tag == TIFFTAG_TILEBYTECOUNTS ) + { + write_aslong4 = WriteAsLong4(tif, TIFFTileSize64(tif)); + } + if( write_aslong4 ) + { + /* + ** For classic tiff we want to verify everything is in range for LONG + ** and convert to long format.
+ */ + + uint32* p = _TIFFmalloc(count*sizeof(uint32)); + uint32* q; + uint64* ma; + uint32 mb; + + if (p==NULL) + { + TIFFErrorExt(tif->tif_clientdata,module,"Out of memory"); return(0); } - *q= (uint32)(*ma); - } - o=TIFFWriteDirectoryTagCheckedLongArray(tif,ndir,dir,tag,count,p); - _TIFFfree(p); + for (q=p, ma=value, mb=0; mb<count; ma++, mb++, q++) + { + if (*ma>0xFFFFFFFF) + { + TIFFErrorExt(tif->tif_clientdata,module, + "Attempt to write value larger than 0xFFFFFFFF in LONG array."); + _TIFFfree(p); + return(0); + } + *q= (uint32)(*ma); + } + + o=TIFFWriteDirectoryTagCheckedLongArray(tif,ndir,dir,tag,count,p); + _TIFFfree(p); + } + else + { + uint16* p = _TIFFmalloc(count*sizeof(uint16)); + uint16* q; + uint64* ma; + uint32 mb; + + if (p==NULL) + { + TIFFErrorExt(tif->tif_clientdata,module,"Out of memory"); + return(0); + } + + for (q=p, ma=value, mb=0; mb<count; ma++, mb++, q++) + { + if (*ma>0xFFFF) + { + /* Should not happen normally given the check we did before */ + TIFFErrorExt(tif->tif_clientdata,module, + "Attempt to write value larger than 0xFFFF in SHORT array."); + _TIFFfree(p); + return(0); + } + *q= (uint16)(*ma); + } + + o=TIFFWriteDirectoryTagCheckedShortArray(tif,ndir,dir,tag,count,p); + _TIFFfree(p); + } return(o); } @@ -2175,19 +2386,20 @@ TIFFWriteDirectoryTagCheckedSlong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* d static int TIFFWriteDirectoryTagCheckedRational(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value) { - static const char module[] = "TIFFWriteDirectoryTagCheckedRational"; + static const char module[] = "TIFFWriteDirectoryTagCheckedRational"; uint32 m[2]; assert(sizeof(uint32)==4); - if( value < 0 ) - { - TIFFErrorExt(tif->tif_clientdata,module,"Negative value is illegal"); - return 0; - } - else if( value != value ) - { - TIFFErrorExt(tif->tif_clientdata,module,"Not-a-number value is illegal"); - return 0; - } + if (value < 0) + { + TIFFErrorExt(tif->tif_clientdata, module, "Negative value is illegal"); + return 0; + } + else if (value != value) + { + TIFFErrorExt(tif->tif_clientdata, module, "Not-a-number value is illegal"); + return 0; + } +#ifdef not_def else if (value==0.0) { m[0]=0; @@ -2208,6 +2420,15 @@ TIFFWriteDirectoryTagCheckedRational(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, m[0]=0xFFFFFFFF; m[1]=(uint32)(0xFFFFFFFF/value); } +#else + /*--Rational2Double: New function also used for non-custom rational tags.
+ * However, it could be omitted here, because TIFFWriteDirectoryTagCheckedRational() is not used by code for custom tags, + * only by code for named-tiff-tags like FIELD_RESOLUTION and FIELD_POSITION */ + else { + DoubleToRational(value, &m[0], &m[1]); + } +#endif + if (tif->tif_flags&TIFF_SWAB) { TIFFSwabLong(&m[0]); @@ -2234,6 +2455,7 @@ TIFFWriteDirectoryTagCheckedRational(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, } for (na=value, nb=m, nc=0; nc<count; na++, nb+=2, nc++) { + DoubleToRational(*na, &nb[0], &nb[1]); } if (tif->tif_flags&TIFF_SWAB) TIFFSwabArrayOfLong(m,count*2); @@ -2281,6 +2507,7 @@ TIFFWriteDirectoryTagCheckedSrationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry } for (na=value, nb=m, nc=0; nc<count; na++, nb+=2, nc++) { + DoubleToSrational(*na, &nb[0], &nb[1]); } if (tif->tif_flags&TIFF_SWAB) TIFFSwabArrayOfLong((uint32*)m,count*2); @@ -2325,6 +2556,400 @@ TIFFWriteDirectoryTagCheckedSrationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry return(o); } +/*-- Rational2Double: additional write functions for double arrays */ +static int +TIFFWriteDirectoryTagCheckedRationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value) +{ + static const char module[] = "TIFFWriteDirectoryTagCheckedRationalDoubleArray"; + uint32* m; + double* na; + uint32* nb; + uint32 nc; + int o; + assert(sizeof(uint32)==4); + m=_TIFFmalloc(count*2*sizeof(uint32)); + if (m==NULL) + { + TIFFErrorExt(tif->tif_clientdata,module,"Out of memory"); + return(0); + } + for (na=value, nb=m, nc=0; nc<count; na++, nb+=2, nc++) + { + DoubleToRational(*na, &nb[0], &nb[1]); + } + if (tif->tif_flags&TIFF_SWAB) + TIFFSwabArrayOfLong(m,count*2); + o=TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_RATIONAL,count,count*8,&m[0]); + _TIFFfree(m); + return(o); +} /*-- TIFFWriteDirectoryTagCheckedRationalDoubleArray() ------- */ + +static int +TIFFWriteDirectoryTagCheckedSrationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value) +{ + static const char module[] = "TIFFWriteDirectoryTagCheckedSrationalDoubleArray"; + int32* m; + double* na; + int32* nb; + uint32 nc; + int o; + assert(sizeof(int32)==4); + m=_TIFFmalloc(count*2*sizeof(int32)); + if (m==NULL) + { + TIFFErrorExt(tif->tif_clientdata,module,"Out of memory"); + return(0); + } + for (na=value, nb=m, nc=0; nc<count; na++, nb+=2, nc++) + { + DoubleToSrational(*na, &nb[0], &nb[1]); + } + if (tif->tif_flags&TIFF_SWAB) + TIFFSwabArrayOfLong((uint32*)m,count*2); + o=TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SRATIONAL,count,count*8,&m[0]); + _TIFFfree(m); + return(o); +} /*--- TIFFWriteDirectoryTagCheckedSrationalDoubleArray() -------- */ +
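/* Editor's note: an illustrative sketch (not part of the patch) of the
 * layout the checked double-array writers above produce: each RATIONAL
 * becomes two adjacent uint32 words (numerator, denominator). It relies on
 * the static DoubleToRational() defined further below in this file. */
static void pack_rationals_sketch(void)
{
    double vals[2] = { 0.5, 300.0 };
    uint32 packed[4]; /* num0, den0, num1, den1 */
    int i;
    for (i = 0; i < 2; i++)
        DoubleToRational(vals[i], &packed[2 * i], &packed[2 * i + 1]);
    /* expected: { 1, 2, 300, 1 }; count*8 bytes then go to
     * TIFFWriteDirectoryTagData() as a TIFF_RATIONAL array */
    (void)packed;
}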
+#if 0 +static +void DoubleToRational_direct(double value, unsigned long *num, unsigned long *denom) +{ + /*--- OLD Code for debugging and comparison ---- */ + /* code merged from TIFFWriteDirectoryTagCheckedRationalArray() and TIFFWriteDirectoryTagCheckedRational() */ + + /* First check for zero and also check for negative numbers (which are illegal for RATIONAL) + * and also check for "not-a-number". In each case just set it to zero, so that rational arrays are also supported. + */ + if (value<=0.0 || value != value) + { + *num=0; + *denom=1; + } + else if (value <= 0xFFFFFFFFU && (value==(double)(uint32)(value))) /* check for integer values */ + { + *num=(uint32)(value); + *denom=1; + } + else if (value<1.0) + { + *num = (uint32)((value) * (double)0xFFFFFFFFU); + *denom=0xFFFFFFFFU; + } + else + { + *num=0xFFFFFFFFU; + *denom=(uint32)((double)0xFFFFFFFFU/(value)); + } +} /*-- DoubleToRational_direct() -------------- */ +#endif + +#if 0 +static +void DoubleToSrational_direct(double value, long *num, long *denom) +{ + /*--- OLD Code for debugging and comparison -- SIGNED-version ----*/ + /* code was amended from original TIFFWriteDirectoryTagCheckedSrationalArray() */ + + /* First check for zero and also check for negative numbers (which are illegal for RATIONAL) + * and also check for "not-a-number". In each case just set it to zero, so that rational arrays are also supported. + */ + if (value<0.0) + { + if (value==(int32)(value)) + { + *num=(int32)(value); + *denom=1; + } + else if (value>-1.0) + { + *num=-(int32)((-value) * (double)0x7FFFFFFF); + *denom=0x7FFFFFFF; + } + else + { + *num=-0x7FFFFFFF; + *denom=(int32)((double)0x7FFFFFFF / (-value)); + } + } + else + { + if (value==(int32)(value)) + { + *num=(int32)(value); + *denom=1; + } + else if (value<1.0) + { + *num=(int32)((value) *(double)0x7FFFFFFF); + *denom=0x7FFFFFFF; + } + else + { + *num=0x7FFFFFFF; + *denom=(int32)((double)0x7FFFFFFF / (value)); + } + } +} /*-- DoubleToSrational_direct() --------------*/ +#endif + +//#define DOUBLE2RAT_DEBUGOUTPUT +/** ----- Rational2Double: Double To Rational Conversion ---------------------------------------------------------- +* There is a mathematical theorem to convert real numbers into a rational (integer fraction) number. +* This is called "continued fraction" and uses the Euclidean algorithm to find the greatest common divisor (GCD). +* (ref. e.g. https://de.wikipedia.org/wiki/Kettenbruch or https://en.wikipedia.org/wiki/Continued_fraction +* https://en.wikipedia.org/wiki/Euclidean_algorithm) +* The following functions implement the +* - ToRationalEuclideanGCD() auxiliary function which mainly implements the Euclidean GCD +* - DoubleToRational() conversion function for un-signed rationals +* - DoubleToSrational() conversion function for signed rationals +------------------------------------------------------------------------------------------------------------------*/ +
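/* Editor's note: a standalone toy version (not part of the patch) of the
 * continued-fraction idea described above: repeated Euclidean steps yield
 * convergents p/q that approximate x. It omits the range clamping and the
 * dual small/large-range strategy of the real functions below. */
#include <math.h>
#include <stdio.h>
static void to_fraction_sketch(double x, unsigned long long max_den,
                               unsigned long long *num, unsigned long long *den)
{
    unsigned long long p0 = 0, p1 = 1, q0 = 1, q1 = 0; /* convergent seeds */
    int i;
    for (i = 0; i < 64; i++) {
        double fl = floor(x);
        unsigned long long a = (unsigned long long)fl;
        unsigned long long p2, q2;
        if (a * q1 + q0 > max_den) /* next denominator would leave the range */
            break;
        p2 = a * p1 + p0; /* advance convergents: p2 = a*p1 + p0 */
        q2 = a * q1 + q0;
        p0 = p1; p1 = p2;
        q0 = q1; q1 = q2;
        if (x == fl)
            break;            /* exact fraction reached */
        x = 1.0 / (x - fl);   /* continue with the fractional part */
    }
    *num = p1;
    *den = q1;
}
int main(void)
{
    unsigned long long n, d;
    to_fraction_sketch(3.14159265358979, 0xFFFFFFFFULL, &n, &d);
    printf("%llu/%llu\n", n, d); /* a close rational approximation of pi */
    return 0;
}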
+/**---- ToRationalEuclideanGCD() ----------------------------------------- +* Calculates the rational fractional of a double input value +* using the Euclidean algorithm to find the greatest common divisor (GCD) +------------------------------------------------------------------------*/ +static +void ToRationalEuclideanGCD(double value, int blnUseSignedRange, int blnUseSmallRange, unsigned long long *ullNum, unsigned long long *ullDenom) +{ + /* Internally, the integer variables can be bigger than the external ones, + * as long as the result will fit into the external variable size. + */ + unsigned long long val, numSum[3] = { 0, 1, 0 }, denomSum[3] = { 1, 0, 0 }; + unsigned long long aux, bigNum, bigDenom; + unsigned long long returnLimit; + int i; + unsigned long long nMax; + double fMax; + unsigned long maxDenom; + /*-- nMax and fMax define the initial accuracy of the starting fractional, + * or better, the highest integer numbers used within the starting fractional (bigNum/bigDenom). + * There are two approaches, which can accidentally lead to different accuracies just depending on the value. + * Therefore, blnUseSmallRange steers this behavior. + * For long long nMax = ((9223372036854775807-1)/2); for long nMax = ((2147483647-1)/2); + */ + if (blnUseSmallRange) { + nMax = (unsigned long long)((2147483647 - 1) / 2); /* for ULONG range */ + } + else { + nMax = ((9223372036854775807 - 1) / 2); /* for ULLONG range */ + } + fMax = (double)nMax; + + /*-- For the Euclidean GCD define the denominator range, so that it stays within size of unsigned long variables. + * maxDenom should be LONG_MAX for negative values and ULONG_MAX for positive ones. + * Also the final returned value of ullNum and ullDenom is limited according to signed- or unsigned-range. + */ + if (blnUseSignedRange) { + maxDenom = 2147483647UL; /*LONG_MAX = 0x7FFFFFFFUL*/ + returnLimit = maxDenom; + } + else { + maxDenom = 0xFFFFFFFFUL; /*ULONG_MAX = 0xFFFFFFFFUL*/ + returnLimit = maxDenom; + } + + /*-- First generate a rational fraction (bigNum/bigDenom) which represents the value + * as a rational number with the highest accuracy. Therefore, unsigned long long (uint64) is needed. + * This rational fraction is then reduced using the Euclidean algorithm to find the greatest common divisor (GCD). + * bigNum = big numerator of value without fraction (or cut residual fraction) + * bigDenom = big denominator of value + *-- Break criteria so that the uint64 cast to "bigNum" introduces no error and bigDenom has no overflow, + * and stop with enlargement of fraction when the double-value of it reaches an integer number without fractional part. + */ + bigDenom = 1; + while ((value != floor(value)) && (value < fMax) && (bigDenom < nMax)) { + bigDenom <<= 1; + value *= 2; + } + bigNum = (unsigned long long)value; + + /*-- Start Euclidean algorithm to find the greatest common divisor (GCD) -- */ +#define MAX_ITERATIONS 64 + for (i = 0; i < MAX_ITERATIONS; i++) { + /* if bigDenom is not zero, calculate integer part of fraction. */ + if (bigDenom == 0) { + val = 0; + break; + } + else { + val = bigNum / bigDenom; + } + + /* Set bigDenom to remainder of bigNum/bigDenom and bigNum to previous denominator bigDenom. */ + aux = bigNum; + bigNum = bigDenom; + bigDenom = aux % bigDenom; + + /* calculate next denominator and check for its given maximum */ + aux = val; + if (denomSum[1] * val + denomSum[0] >= maxDenom) { + aux = (maxDenom - denomSum[0]) / denomSum[1]; + if (aux * 2 >= val || denomSum[1] >= maxDenom) + i = (MAX_ITERATIONS + 1); /* exit but execute rest of for-loop */ + else + break; + } + /* calculate next numerator to numSum2 and save previous one to numSum0; numSum1 just copy of numSum2. */ + numSum[2] = aux * numSum[1] + numSum[0]; + numSum[0] = numSum[1]; + numSum[1] = numSum[2]; + /* calculate next denominator to denomSum2 and save previous one to denomSum0; denomSum1 just copy of denomSum2.
*/ + denomSum[2] = aux * denomSum[1] + denomSum[0]; + denomSum[0] = denomSum[1]; + denomSum[1] = denomSum[2]; + } + + /*-- Check and adapt for final variable size and return values; reduces internal accuracy; denominator is kept in ULONG-range with maxDenom -- */ + while (numSum[1] > returnLimit || denomSum[1] > returnLimit) { + numSum[1] = numSum[1] / 2; + denomSum[1] = denomSum[1] / 2; + } + + /* return values */ + *ullNum = numSum[1]; + *ullDenom = denomSum[1]; + +} /*-- ToRationalEuclideanGCD() -------------- */ + + +/**---- DoubleToRational() ----------------------------------------------- +* Calculates the rational fractional of a double input value +* for UN-SIGNED rationals, +* using the Euclidean algorithm to find the greatest common divisor (GCD) +------------------------------------------------------------------------*/ +static +void DoubleToRational(double value, uint32 *num, uint32 *denom) +{ + /*---- UN-SIGNED RATIONAL ---- */ + double dblDiff, dblDiff2; + unsigned long long ullNum, ullDenom, ullNum2, ullDenom2; + + /*-- Check for negative values. If so it is an error. */ + /* Test written that way to catch NaN */ + if (!(value >= 0)) { + *num = *denom = 0; + TIFFErrorExt(0, "TIFFLib: DoubleToRational()", " Negative Value for Unsigned Rational given."); + return; + } + + /*-- Check for too big numbers (> ULONG_MAX) -- */ + if (value > 0xFFFFFFFFUL) { + *num = 0xFFFFFFFFU; + *denom = 0; + return; + } + /*-- Check for easy integer numbers -- */ + if (value == (uint32)(value)) { + *num = (uint32)value; + *denom = 1; + return; + } + /*-- Check for too small numbers for "unsigned long" type rationals -- */ + if (value < 1.0 / (double)0xFFFFFFFFUL) { + *num = 0; + *denom = 0xFFFFFFFFU; + return; + } + + /*-- There are two approaches using the Euclidean algorithm, + * which can accidentally lead to different accuracies just depending on the value. + * Try both and decide which one is better. + */ + ToRationalEuclideanGCD(value, FALSE, FALSE, &ullNum, &ullDenom); + ToRationalEuclideanGCD(value, FALSE, TRUE, &ullNum2, &ullDenom2); + /*-- Double-check that the returned values fit into ULONG: */ + if (ullNum > 0xFFFFFFFFUL || ullDenom > 0xFFFFFFFFUL || ullNum2 > 0xFFFFFFFFUL || ullDenom2 > 0xFFFFFFFFUL) { +#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__)) + TIFFErrorExt(0, "TIFFLib: DoubleToRational()", " Num or Denom exceeds ULONG: val=%14.6f, num=%I64u, denom=%I64u | num2=%I64u, denom2=%I64u", value, ullNum, ullDenom, ullNum2, ullDenom2); +#else + TIFFErrorExt(0, "TIFFLib: DoubleToRational()", " Num or Denom exceeds ULONG: val=%14.6f, num=%12llu, denom=%12llu | num2=%12llu, denom2=%12llu", value, ullNum, ullDenom, ullNum2, ullDenom2); +#endif + assert(0); + } + + /* Check which one has higher accuracy and take that.
*/ + dblDiff = fabs(value - ((double)ullNum / (double)ullDenom)); + dblDiff2 = fabs(value - ((double)ullNum2 / (double)ullDenom2)); + if (dblDiff < dblDiff2) { + *num = (uint32)ullNum; + *denom = (uint32)ullDenom; + } + else { + *num = (uint32)ullNum2; + *denom = (uint32)ullDenom2; + } +} /*-- DoubleToRational() -------------- */ + +/**---- DoubleToSrational() ----------------------------------------------- +* Calculates the rational fractional of a double input value +* for SIGNED rationals, +* using the Euclidean algorithm to find the greatest common divisor (GCD) +------------------------------------------------------------------------*/ +static +void DoubleToSrational(double value, int32 *num, int32 *denom) +{ + /*---- SIGNED RATIONAL ----*/ + int neg = 1; + double dblDiff, dblDiff2; + unsigned long long ullNum, ullDenom, ullNum2, ullDenom2; + + /*-- Check for negative values, and then use the positive one for internal calculations, but take the sign into account before returning. */ + if (value < 0) { neg = -1; value = -value; } + + /*-- Check for too big numbers (> LONG_MAX) -- */ + if (value > 0x7FFFFFFFL) { + *num = 0x7FFFFFFFL; + *denom = 0; + return; + } + /*-- Check for easy numbers -- */ + if (value == (int32)(value)) { + *num = (int32)(neg * value); + *denom = 1; + return; + } + /*-- Check for too small numbers for "long" type rationals -- */ + if (value < 1.0 / (double)0x7FFFFFFFL) { + *num = 0; + *denom = 0x7FFFFFFFL; + return; + } + + /*-- There are two approaches using the Euclidean algorithm, + * which can accidentally lead to different accuracies just depending on the value. + * Try both and decide which one is better. + * Furthermore, set behavior of ToRationalEuclideanGCD() to the range of signed-long. + */ + ToRationalEuclideanGCD(value, TRUE, FALSE, &ullNum, &ullDenom); + ToRationalEuclideanGCD(value, TRUE, TRUE, &ullNum2, &ullDenom2); + /*-- Double-check that the returned values fit into LONG: */ + if (ullNum > 0x7FFFFFFFL || ullDenom > 0x7FFFFFFFL || ullNum2 > 0x7FFFFFFFL || ullDenom2 > 0x7FFFFFFFL) { +#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__)) + TIFFErrorExt(0, "TIFFLib: DoubleToSrational()", " Num or Denom exceeds LONG: val=%14.6f, num=%I64u, denom=%I64u | num2=%I64u, denom2=%I64u", neg*value, ullNum, ullDenom, ullNum2, ullDenom2); +#else + TIFFErrorExt(0, "TIFFLib: DoubleToSrational()", " Num or Denom exceeds LONG: val=%14.6f, num=%12llu, denom=%12llu | num2=%12llu, denom2=%12llu", neg*value, ullNum, ullDenom, ullNum2, ullDenom2); +#endif + assert(0); + } + + /* Check which one has higher accuracy and take that.
*/ + dblDiff = fabs(value - ((double)ullNum / (double)ullDenom)); + dblDiff2 = fabs(value - ((double)ullNum2 / (double)ullDenom2)); + if (dblDiff < dblDiff2) { + *num = (int32)(neg * (long)ullNum); + *denom = (int32)ullDenom; + } + else { + *num = (int32)(neg * (long)ullNum2); + *denom = (int32)ullDenom2; + } +} /*-- DoubleToSrational() --------------*/ + + + + + #ifdef notdef static int TIFFWriteDirectoryTagCheckedFloat(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, float value) @@ -2420,7 +3045,12 @@ TIFFWriteDirectoryTagData(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag dir[m].tdir_count=count; dir[m].tdir_offset.toff_long8 = 0; if (datalength<=((tif->tif_flags&TIFF_BIGTIFF)?0x8U:0x4U)) - _TIFFmemcpy(&dir[m].tdir_offset,data,datalength); + { + if( data && datalength ) + { + _TIFFmemcpy(&dir[m].tdir_offset,data,datalength); + } + } else { uint64 na,nb; @@ -2812,13 +3442,60 @@ _TIFFRewriteField(TIFF* tif, uint16 tag, TIFFDataType in_datatype, TIFFSwabLong8( &entry_offset ); } +/* -------------------------------------------------------------------- */ +/* When a dummy tag was written due to TIFFDeferStrileArrayWriting() */ +/* -------------------------------------------------------------------- */ + if( entry_offset == 0 && entry_count == 0 && entry_type == 0 ) + { + if( tag == TIFFTAG_TILEOFFSETS || tag == TIFFTAG_STRIPOFFSETS ) + { + entry_type = (tif->tif_flags&TIFF_BIGTIFF) ? TIFF_LONG8 : TIFF_LONG; + } + else + { + int write_aslong8 = 1; + if( count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS ) + { + write_aslong8 = WriteAsLong8(tif, TIFFStripSize64(tif)); + } + else if( count > 1 && tag == TIFFTAG_TILEBYTECOUNTS ) + { + write_aslong8 = WriteAsLong8(tif, TIFFTileSize64(tif)); + } + if( write_aslong8 ) + { + entry_type = TIFF_LONG8; + } + else + { + int write_aslong4 = 1; + if( count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS ) + { + write_aslong4 = WriteAsLong4(tif, TIFFStripSize64(tif)); + } + else if( count > 1 && tag == TIFFTAG_TILEBYTECOUNTS ) + { + write_aslong4 = WriteAsLong4(tif, TIFFTileSize64(tif)); + } + if( write_aslong4 ) + { + entry_type = TIFF_LONG; + } + else + { + entry_type = TIFF_SHORT; + } + } + } + } + /* -------------------------------------------------------------------- */ /* What data type do we want to write this as? */ /* -------------------------------------------------------------------- */ if( TIFFDataWidth(in_datatype) == 8 && !(tif->tif_flags&TIFF_BIGTIFF) ) { if( in_datatype == TIFF_LONG8 ) - datatype = TIFF_LONG; + datatype = entry_type == TIFF_SHORT ? TIFF_SHORT : TIFF_LONG; else if( in_datatype == TIFF_SLONG8 ) datatype = TIFF_SLONG; else if( in_datatype == TIFF_IFD8 ) @@ -2826,8 +3503,21 @@ _TIFFRewriteField(TIFF* tif, uint16 tag, TIFFDataType in_datatype, else datatype = in_datatype; } - else - datatype = in_datatype; + else + { + if( in_datatype == TIFF_LONG8 && + (entry_type == TIFF_SHORT || entry_type == TIFF_LONG || + entry_type == TIFF_LONG8 ) ) + datatype = entry_type; + else if( in_datatype == TIFF_SLONG8 && + (entry_type == TIFF_SLONG || entry_type == TIFF_SLONG8 ) ) + datatype = entry_type; + else if( in_datatype == TIFF_IFD8 && + (entry_type == TIFF_IFD || entry_type == TIFF_IFD8 ) ) + datatype = entry_type; + else + datatype = in_datatype; + } /* -------------------------------------------------------------------- */ /* Prepare buffer of actual data to write. 
This includes */ @@ -2876,6 +3566,29 @@ _TIFFRewriteField(TIFF* tif, uint16 tag, TIFFDataType in_datatype, } } } + else if( datatype == TIFF_SHORT && in_datatype == TIFF_LONG8 ) + { + tmsize_t i; + + for( i = 0; i < count; i++ ) + { + ((uint16 *) buf_to_write)[i] = + (uint16) ((uint64 *) data)[i]; + if( (uint64) ((uint16 *) buf_to_write)[i] != ((uint64 *) data)[i] ) + { + _TIFFfree( buf_to_write ); + TIFFErrorExt( tif->tif_clientdata, module, + "Value exceeds 16bit range of output type." ); + return 0; + } + } + } + else + { + TIFFErrorExt( tif->tif_clientdata, module, + "Unhandled type conversion." ); + return 0; + } if( TIFFDataWidth(datatype) > 1 && (tif->tif_flags&TIFF_SWAB) ) { @@ -2907,6 +3620,23 @@ _TIFFRewriteField(TIFF* tif, uint16 tag, TIFFDataType in_datatype, } } + if( (tag == TIFFTAG_TILEOFFSETS || tag == TIFFTAG_STRIPOFFSETS) && + tif->tif_dir.td_stripoffset_entry.tdir_count == 0 && + tif->tif_dir.td_stripoffset_entry.tdir_type == 0 && + tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 ) + { + tif->tif_dir.td_stripoffset_entry.tdir_type = datatype; + tif->tif_dir.td_stripoffset_entry.tdir_count = count; + } + else if( (tag == TIFFTAG_TILEBYTECOUNTS || tag == TIFFTAG_STRIPBYTECOUNTS) && + tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0 ) + { + tif->tif_dir.td_stripbytecount_entry.tdir_type = datatype; + tif->tif_dir.td_stripbytecount_entry.tdir_count = count; + } + /* -------------------------------------------------------------------- */ /* If the tag type, and count match, then we just write it out */ /* over the old values without altering the directory entry at */ @@ -2958,6 +3688,7 @@ _TIFFRewriteField(TIFF* tif, uint16 tag, TIFFDataType in_datatype, /* Adjust the directory entry. */ /* -------------------------------------------------------------------- */ entry_type = datatype; + entry_count = (uint64)count; memcpy( direntry_raw + 2, &entry_type, sizeof(uint16) ); if (tif->tif_flags&TIFF_SWAB) TIFFSwabShort( (uint16 *) (direntry_raw + 2) ); diff --git a/3rdparty/libtiff/tif_fax3.c b/3rdparty/libtiff/tif_fax3.c index d11c968444..9ab5b26ad3 100644 --- a/3rdparty/libtiff/tif_fax3.c +++ b/3rdparty/libtiff/tif_fax3.c @@ -73,6 +73,7 @@ typedef struct { int EOLcnt; /* count of EOL codes recognized */ TIFFFaxFillFunc fill; /* fill routine */ uint32* runs; /* b&w runs for current/previous row */ + uint32 nruns; /* size of the refruns / curruns arrays */ uint32* refruns; /* runs for reference line */ uint32* curruns; /* runs for current line */ @@ -160,7 +161,9 @@ Fax3PreDecode(TIFF* tif, uint16 s) */ sp->bitmap = TIFFGetBitRevTable(tif->tif_dir.td_fillorder != FILLORDER_LSB2MSB); + sp->curruns = sp->runs; if (sp->refruns) { /* init reference line to white */ + sp->refruns = sp->runs + sp->nruns; sp->refruns[0] = (uint32) sp->b.rowpixels; sp->refruns[1] = 0; } @@ -218,8 +221,12 @@ Fax3PrematureEOF(const char* module, TIFF* tif, uint32 line, uint32 a0) #define Nop -/* +/** * Decode the requested amount of G3 1D-encoded data. 
+ * @param buf destination buffer + * @param occ available bytes in destination buffer + * @param s number of planes (ignored) + * @returns 1 for success, -1 in case of error */ static int Fax3Decode1D(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s) @@ -300,7 +307,9 @@ Fax3Decode2D(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s) else EXPAND2D(EOF2Da); (*sp->fill)(buf, thisrun, pa, lastx); - SETVALUE(0); /* imaginary change for reference */ + if (pa < thisrun + sp->nruns) { + SETVALUE(0); /* imaginary change for reference */ + } SWAP(uint32*, sp->curruns, sp->refruns); buf += sp->b.rowbytes; occ -= sp->b.rowbytes; @@ -506,7 +515,7 @@ Fax3SetupState(TIFF* tif) int needsRefLine; Fax3CodecState* dsp = (Fax3CodecState*) Fax3State(tif); tmsize_t rowbytes; - uint32 rowpixels, nruns; + uint32 rowpixels; if (td->td_bitspersample != 1) { TIFFErrorExt(tif->tif_clientdata, module, @@ -523,6 +532,13 @@ Fax3SetupState(TIFF* tif) rowbytes = TIFFScanlineSize(tif); rowpixels = td->td_imagewidth; } + if ((uint64)rowbytes < ((uint64)rowpixels + 7) / 8) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Inconsistent number of bytes per row : rowbytes=%lu rowpixels=%lu", + (unsigned long)(rowbytes), (unsigned long)(rowpixels)); + return (0); + } sp->rowbytes = rowbytes; sp->rowpixels = rowpixels; /* @@ -539,26 +555,26 @@ Fax3SetupState(TIFF* tif) TIFFroundup and TIFFSafeMultiply return zero on integer overflow */ dsp->runs=(uint32*) NULL; - nruns = TIFFroundup_32(rowpixels,32); + dsp->nruns = TIFFroundup_32(rowpixels,32); if (needsRefLine) { - nruns = TIFFSafeMultiply(uint32,nruns,2); + dsp->nruns = TIFFSafeMultiply(uint32,dsp->nruns,2); } - if ((nruns == 0) || (TIFFSafeMultiply(uint32,nruns,2) == 0)) { + if ((dsp->nruns == 0) || (TIFFSafeMultiply(uint32,dsp->nruns,2) == 0)) { TIFFErrorExt(tif->tif_clientdata, tif->tif_name, "Row pixels integer overflow (rowpixels %u)", rowpixels); return (0); } dsp->runs = (uint32*) _TIFFCheckMalloc(tif, - TIFFSafeMultiply(uint32,nruns,2), + TIFFSafeMultiply(uint32,dsp->nruns,2), sizeof (uint32), "for Group 3/4 run arrays"); if (dsp->runs == NULL) return (0); - memset( dsp->runs, 0, TIFFSafeMultiply(uint32,nruns,2)*sizeof(uint32)); + memset( dsp->runs, 0, TIFFSafeMultiply(uint32,dsp->nruns,2)*sizeof(uint32)); dsp->curruns = dsp->runs; if (needsRefLine) - dsp->refruns = dsp->runs + nruns; + dsp->refruns = dsp->runs + dsp->nruns; else dsp->refruns = NULL; if (td->td_compression == COMPRESSION_CCITTFAX3 @@ -594,15 +610,19 @@ Fax3SetupState(TIFF* tif) */ #define Fax3FlushBits(tif, sp) { \ - if ((tif)->tif_rawcc >= (tif)->tif_rawdatasize) \ - (void) TIFFFlushData1(tif); \ + if ((tif)->tif_rawcc >= (tif)->tif_rawdatasize) { \ + if( !TIFFFlushData1(tif) ) \ + return 0; \ + } \ *(tif)->tif_rawcp++ = (uint8) (sp)->data; \ (tif)->tif_rawcc++; \ (sp)->data = 0, (sp)->bit = 8; \ } #define _FlushBits(tif) { \ - if ((tif)->tif_rawcc >= (tif)->tif_rawdatasize) \ - (void) TIFFFlushData1(tif); \ + if ((tif)->tif_rawcc >= (tif)->tif_rawdatasize) { \ + if( !TIFFFlushData1(tif) ) \ + return 0; \ + } \ *(tif)->tif_rawcp++ = (uint8) data; \ (tif)->tif_rawcc++; \ data = 0, bit = 8; \ @@ -627,7 +647,7 @@ static const int _msbmask[9] = * the output stream. Values are * assumed to be at most 16 bits. 
*/ -static void +static int Fax3PutBits(TIFF* tif, unsigned int bits, unsigned int length) { Fax3CodecState* sp = EncoderState(tif); @@ -638,6 +658,7 @@ Fax3PutBits(TIFF* tif, unsigned int bits, unsigned int length) sp->data = data; sp->bit = bit; + return 1; } /* @@ -662,7 +683,7 @@ Fax3PutBits(TIFF* tif, unsigned int bits, unsigned int length) * appropriate table that holds the make-up and * terminating codes is supplied. */ -static void +static int putspan(TIFF* tif, int32 span, const tableentry* tab) { Fax3CodecState* sp = EncoderState(tif); @@ -700,6 +721,8 @@ putspan(TIFF* tif, int32 span, const tableentry* tab) sp->data = data; sp->bit = bit; + + return 1; } /* @@ -708,7 +731,7 @@ putspan(TIFF* tif, int32 span, const tableentry* tab) * here. We also handle writing the tag bit for the next * scanline when doing 2d encoding. */ -static void +static int Fax3PutEOL(TIFF* tif) { Fax3CodecState* sp = EncoderState(tif); @@ -742,6 +765,8 @@ Fax3PutEOL(TIFF* tif) sp->data = data; sp->bit = bit; + + return 1; } /* @@ -991,12 +1016,14 @@ Fax3Encode1DRow(TIFF* tif, unsigned char* bp, uint32 bits) for (;;) { span = find0span(bp, bs, bits); /* white span */ - putspan(tif, span, TIFFFaxWhiteCodes); + if( !putspan(tif, span, TIFFFaxWhiteCodes) ) + return 0; bs += span; if (bs >= bits) break; span = find1span(bp, bs, bits); /* black span */ - putspan(tif, span, TIFFFaxBlackCodes); + if( !putspan(tif, span, TIFFFaxBlackCodes) ) + return 0; bs += span; if (bs >= bits) break; @@ -1048,21 +1075,28 @@ Fax3Encode2DRow(TIFF* tif, unsigned char* bp, unsigned char* rp, uint32 bits) (b1 < a1 && a1 - b1 <= 3U) ? -(int32)(a1 - b1) : 0x7FFFFFFF; if (!(-3 <= d && d <= 3)) { /* horizontal mode */ a2 = finddiff2(bp, a1, bits, PIXEL(bp,a1)); - putcode(tif, &horizcode); + if( !putcode(tif, &horizcode) ) + return 0; if (a0+a1 == 0 || PIXEL(bp, a0) == 0) { - putspan(tif, a1-a0, TIFFFaxWhiteCodes); - putspan(tif, a2-a1, TIFFFaxBlackCodes); + if( !putspan(tif, a1-a0, TIFFFaxWhiteCodes) ) + return 0; + if( !putspan(tif, a2-a1, TIFFFaxBlackCodes) ) + return 0; } else { - putspan(tif, a1-a0, TIFFFaxBlackCodes); - putspan(tif, a2-a1, TIFFFaxWhiteCodes); + if( !putspan(tif, a1-a0, TIFFFaxBlackCodes) ) + return 0; + if( !putspan(tif, a2-a1, TIFFFaxWhiteCodes) ) + return 0; } a0 = a2; } else { /* vertical mode */ - putcode(tif, &vcodes[d+3]); + if( !putcode(tif, &vcodes[d+3]) ) + return 0; a0 = a1; } } else { /* pass mode */ - putcode(tif, &passcode); + if( !putcode(tif, &passcode) ) + return 0; a0 = b2; } if (a0 >= bits) @@ -1091,7 +1125,10 @@ Fax3Encode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) } while (cc > 0) { if ((sp->b.mode & FAXMODE_NOEOL) == 0) - Fax3PutEOL(tif); + { + if( !Fax3PutEOL(tif) ) + return 0; + } if (is2DEncoding(sp)) { if (sp->tag == G3_1D) { if (!Fax3Encode1DRow(tif, bp, sp->b.rowpixels)) @@ -1128,8 +1165,8 @@ Fax3PostEncode(TIFF* tif) return (1); } -static void -Fax3Close(TIFF* tif) +static int +_Fax3Close(TIFF* tif) { if ((Fax3State(tif)->mode & FAXMODE_NORTC) == 0 && tif->tif_rawcp) { Fax3CodecState* sp = EncoderState(tif); @@ -1145,6 +1182,13 @@ Fax3Close(TIFF* tif) Fax3PutBits(tif, code, length); Fax3FlushBits(tif, sp); } + return 1; +} + +static void +Fax3Close(TIFF* tif) +{ + _Fax3Close(tif); } static void @@ -1453,6 +1497,13 @@ Fax4Decode(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s) EXPAND2D(EOFG4); if (EOLcnt) goto EOFG4; + if (((lastx + 7) >> 3) > (int)occ) /* check for buffer overrun */ + { + TIFFErrorExt(tif->tif_clientdata, module, + "Buffer overrun detected : %d bytes available, %d bits 
needed", + (int)occ, lastx); + return -1; + } (*sp->fill)(buf, thisrun, pa, lastx); SETVALUE(0); /* imaginary change for reference */ SWAP(uint32*, sp->curruns, sp->refruns); @@ -1468,6 +1519,13 @@ Fax4Decode(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s) fputs( "Bad EOFB\n", stderr ); #endif ClrBits( 13 ); + if (((lastx + 7) >> 3) > (int)occ) /* check for buffer overrun */ + { + TIFFErrorExt(tif->tif_clientdata, module, + "Buffer overrun detected : %d bytes available, %d bits needed", + (int)occ, lastx); + return -1; + } (*sp->fill)(buf, thisrun, pa, lastx); UNCACHE_STATE(tif, sp); return ( sp->line ? 1 : -1); /* don't error on badly-terminated strips */ diff --git a/3rdparty/libtiff/tif_fax3.h b/3rdparty/libtiff/tif_fax3.h index abadcd97a2..701716cc18 100644 --- a/3rdparty/libtiff/tif_fax3.h +++ b/3rdparty/libtiff/tif_fax3.h @@ -240,6 +240,11 @@ static const char* StateNames[] = { * current row and reset decoding state. */ #define SETVALUE(x) do { \ + if (pa >= thisrun + sp->nruns) { \ + TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u", \ + sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip); \ + return (-1); \ + } \ *pa++ = RunLength + (x); \ a0 += (x); \ RunLength = 0; \ @@ -377,6 +382,11 @@ done1d: \ */ #define CHECK_b1 do { \ if (pa != thisrun) while (b1 <= a0 && b1 < lastx) { \ + if( pb + 1 >= sp->refruns + sp->nruns) { \ + TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u", \ + sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip); \ + return (-1); \ + } \ b1 += pb[0] + pb[1]; \ pb += 2; \ } \ @@ -387,10 +397,20 @@ done1d: \ */ #define EXPAND2D(eoflab) do { \ while (a0 < lastx) { \ + if (pa >= thisrun + sp->nruns) { \ + TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u", \ + sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip); \ + return (-1); \ + } \ LOOKUP8(7, TIFFFaxMainTable, eof2d); \ switch (TabEnt->State) { \ case S_Pass: \ CHECK_b1; \ + if( pb + 1 >= sp->refruns + sp->nruns) { \ + TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u", \ + sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip); \ + return (-1); \ + } \ b1 += *pb++; \ RunLength += b1 - a0; \ a0 = b1; \ @@ -469,20 +489,28 @@ done1d: \ case S_V0: \ CHECK_b1; \ SETVALUE(b1 - a0); \ + if( pb >= sp->refruns + sp->nruns) { \ + TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u", \ + sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip); \ + return (-1); \ + } \ b1 += *pb++; \ break; \ case S_VR: \ CHECK_b1; \ SETVALUE(b1 - a0 + TabEnt->Param); \ + if( pb >= sp->refruns + sp->nruns) { \ + TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u", \ + sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? 
tif->tif_curtile : tif->tif_curstrip); \ + return (-1); \ + } \ b1 += *pb++; \ break; \ case S_VL: \ CHECK_b1; \ - if (b1 <= (int) (a0 + TabEnt->Param)) { \ - if (b1 < (int) (a0 + TabEnt->Param) || pa != thisrun) { \ - unexpected("VL", a0); \ - goto eol2d; \ - } \ + if (b1 < (int) (a0 + TabEnt->Param)) { \ + unexpected("VL", a0); \ + goto eol2d; \ } \ SETVALUE(b1 - a0 - TabEnt->Param); \ b1 -= *--pb; \ @@ -529,6 +557,7 @@ eol2d: \ CLEANUP_RUNS(); \ } while (0) #endif /* _FAX3_ */ +/* vim: set ts=8 sts=4 sw=4 noet: */ /* * Local Variables: * mode: c diff --git a/3rdparty/libtiff/tif_fax3sm.c b/3rdparty/libtiff/tif_fax3sm.c index 822191ecf4..ba2fc532e8 100644 --- a/3rdparty/libtiff/tif_fax3sm.c +++ b/3rdparty/libtiff/tif_fax3sm.c @@ -1,5 +1,6 @@ /* WARNING, this file was automatically generated by the mkg3states program */ +#include <stdint.h> #include "tiff.h" #include "tif_fax3.h" const TIFFFaxTabEnt TIFFFaxMainTable[128] = { diff --git a/3rdparty/libtiff/tif_flush.c b/3rdparty/libtiff/tif_flush.c index 881fac5121..f7fa2072ab 100644 --- a/3rdparty/libtiff/tif_flush.c +++ b/3rdparty/libtiff/tif_flush.c @@ -45,36 +45,8 @@ TIFFFlush(TIFF* tif) && !(tif->tif_flags & TIFF_DIRTYDIRECT) && tif->tif_mode == O_RDWR ) { - uint64 *offsets=NULL, *sizes=NULL; - - if( TIFFIsTiled(tif) ) - { - if( TIFFGetField( tif, TIFFTAG_TILEOFFSETS, &offsets ) - && TIFFGetField( tif, TIFFTAG_TILEBYTECOUNTS, &sizes ) - && _TIFFRewriteField( tif, TIFFTAG_TILEOFFSETS, TIFF_LONG8, - tif->tif_dir.td_nstrips, offsets ) - && _TIFFRewriteField( tif, TIFFTAG_TILEBYTECOUNTS, TIFF_LONG8, - tif->tif_dir.td_nstrips, sizes ) ) - { - tif->tif_flags &= ~TIFF_DIRTYSTRIP; - tif->tif_flags &= ~TIFF_BEENWRITING; - return 1; - } - } - else - { - if( TIFFGetField( tif, TIFFTAG_STRIPOFFSETS, &offsets ) - && TIFFGetField( tif, TIFFTAG_STRIPBYTECOUNTS, &sizes ) - && _TIFFRewriteField( tif, TIFFTAG_STRIPOFFSETS, TIFF_LONG8, - tif->tif_dir.td_nstrips, offsets ) - && _TIFFRewriteField( tif, TIFFTAG_STRIPBYTECOUNTS, TIFF_LONG8, - tif->tif_dir.td_nstrips, sizes ) ) - { - tif->tif_flags &= ~TIFF_DIRTYSTRIP; - tif->tif_flags &= ~TIFF_BEENWRITING; - return 1; - } - } + if( TIFFForceStrileArrayWriting(tif) ) + return 1; } if ((tif->tif_flags & (TIFF_DIRTYDIRECT|TIFF_DIRTYSTRIP)) @@ -84,6 +56,92 @@ ... return (1); } +/* + * This is an advanced writing function that must be used in a particular + * sequence, and together with TIFFDeferStrileArrayWriting(), + * to achieve its intended effect. Its aim is to force the writing of + * the [Strip/Tile][Offsets/ByteCounts] arrays at the end of the file, when + * they have not yet been rewritten. + * + * The typical sequence of calls is: + * TIFFOpen() + * [ TIFFCreateDirectory(tif) ] + * Set fields with calls to TIFFSetField(tif, ...) + * TIFFDeferStrileArrayWriting(tif) + * TIFFWriteCheck(tif, ...) + * TIFFWriteDirectory(tif) + * ... potentially create other directories and come back to the above directory + * TIFFForceStrileArrayWriting(tif) + * + * Returns 1 in case of success, 0 otherwise.
+ */ +int TIFFForceStrileArrayWriting(TIFF* tif) +{ + static const char module[] = "TIFFForceStrileArrayWriting"; + const int isTiled = TIFFIsTiled(tif); + + if (tif->tif_mode == O_RDONLY) + { + TIFFErrorExt(tif->tif_clientdata, tif->tif_name, + "File opened in read-only mode"); + return 0; + } + if( tif->tif_diroff == 0 ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Directory has not yet been written"); + return 0; + } + if( (tif->tif_flags & TIFF_DIRTYDIRECT) != 0 ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Directory has changes other than the strile arrays. " + "TIFFRewriteDirectory() should be called instead"); + return 0; + } + + if( !(tif->tif_flags & TIFF_DIRTYSTRIP) ) + { + if( !(tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 && + tif->tif_dir.td_stripoffset_entry.tdir_count == 0 && + tif->tif_dir.td_stripoffset_entry.tdir_type == 0 && + tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0) ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Function not called together with " + "TIFFDeferStrileArrayWriting()"); + return 0; + } + + if (tif->tif_dir.td_stripoffset_p == NULL && !TIFFSetupStrips(tif)) + return 0; + } + + if( _TIFFRewriteField( tif, + isTiled ? TIFFTAG_TILEOFFSETS : + TIFFTAG_STRIPOFFSETS, + TIFF_LONG8, + tif->tif_dir.td_nstrips, + tif->tif_dir.td_stripoffset_p ) + && _TIFFRewriteField( tif, + isTiled ? TIFFTAG_TILEBYTECOUNTS : + TIFFTAG_STRIPBYTECOUNTS, + TIFF_LONG8, + tif->tif_dir.td_nstrips, + tif->tif_dir.td_stripbytecount_p ) ) + { + tif->tif_flags &= ~TIFF_DIRTYSTRIP; + tif->tif_flags &= ~TIFF_BEENWRITING; + return 1; + } + + return 0; +} +
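/* Editor's note: a small sketch (not part of the patch) of the simplified
 * TIFFFlush() path above: rewriting strip data of an existing file and
 * flushing now goes through TIFFForceStrileArrayWriting() instead of the
 * removed open-coded _TIFFRewriteField() calls. The file name and the
 * minimal error handling are illustrative only. */
#include "tiffio.h"
static int rewrite_first_strip_sketch(unsigned char* buf, tmsize_t size)
{
    int ok;
    TIFF* tif = TIFFOpen("in.tif", "r+");     /* update mode */
    if (!tif) return 0;
    TIFFWriteEncodedStrip(tif, 0, buf, size); /* marks strile arrays dirty */
    ok = TIFFFlush(tif);                      /* strile-only rewrite path */
    TIFFClose(tif);
    return ok;
}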
 /* * Flush buffered data to the file. * diff --git a/3rdparty/libtiff/tif_getimage.c b/3rdparty/libtiff/tif_getimage.c index 6a9d5a7c0c..3460af744e 100644 --- a/3rdparty/libtiff/tif_getimage.c +++ b/3rdparty/libtiff/tif_getimage.c @@ -29,6 +29,7 @@ */ #include "tiffiop.h" #include <stdio.h> +#include <limits.h> static int gtTileContig(TIFFRGBAImage*, uint32*, uint32, uint32); static int gtTileSeparate(TIFFRGBAImage*, uint32*, uint32, uint32); @@ -645,12 +646,20 @@ gtTileContig(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) flip = setorientation(img); if (flip & FLIP_VERTICALLY) { - y = h - 1; - toskew = -(int32)(tw + w); + if ((tw + w) > INT_MAX) { + TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", "unsupported tile size (too wide)"); + return (0); + } + y = h - 1; + toskew = -(int32)(tw + w); } else { - y = 0; - toskew = -(int32)(tw - w); + if (tw > (INT_MAX + w)) { + TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", "unsupported tile size (too wide)"); + return (0); + } + y = 0; + toskew = -(int32)(tw - w); } /* @@ -755,9 +764,8 @@ gtTileSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) uint32 leftmost_tw; tilesize = TIFFTileSize(tif); - bufsize = TIFFSafeMultiply(tmsize_t,alpha?4:3,tilesize); + bufsize = _TIFFMultiplySSize(tif, alpha?4:3,tilesize, "gtTileSeparate"); if (bufsize == 0) { - TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Integer overflow in %s", "gtTileSeparate"); return (0); } @@ -766,10 +774,18 @@ gtTileSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) flip = setorientation(img); if (flip & FLIP_VERTICALLY) { + if ((tw + w) > INT_MAX) { + TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", "unsupported tile size (too wide)"); + return (0); + } y = h - 1; toskew = -(int32)(tw + w); } else { + if (tw > (INT_MAX + w)) { + TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", "unsupported tile size (too wide)"); + return (0); + } y = 0; toskew = -(int32)(tw - w); } @@ -937,6 +953,10 @@ gtStripContig(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) flip = setorientation(img); if (flip & FLIP_VERTICALLY) { + if ( w > INT_MAX ) { + TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Width overflow"); + return (0); + } y = h - 1; toskew = -(int32)(w + w); } else { @@ -950,16 +970,23 @@ gtStripContig(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) fromskew = (w < imagewidth ? imagewidth - w : 0); for (row = 0; row < h; row += nrow) { + uint32 temp; rowstoread = rowsperstrip - (row + img->row_offset) % rowsperstrip; nrow = (row + rowstoread > h ?
h - row : rowstoread); nrowsub = nrow; if ((nrowsub%subsamplingver)!=0) nrowsub+=subsamplingver-nrowsub%subsamplingver; + temp = (row + img->row_offset)%rowsperstrip + nrowsub; + if( scanline > 0 && temp > (size_t)(TIFF_TMSIZE_T_MAX / scanline) ) + { + TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Integer overflow in gtStripContig"); + return 0; + } if (_TIFFReadEncodedStripAndAllocBuffer(tif, TIFFComputeStrip(tif,row+img->row_offset, 0), (void**)(&buf), maxstripsize, - ((row + img->row_offset)%rowsperstrip + nrowsub) * scanline)==(tmsize_t)(-1) + temp * scanline)==(tmsize_t)(-1) && (buf == NULL || img->stoponerr)) { ret = 0; @@ -1019,14 +1046,17 @@ gtStripSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) uint16 colorchannels; stripsize = TIFFStripSize(tif); - bufsize = TIFFSafeMultiply(tmsize_t,alpha?4:3,stripsize); + bufsize = _TIFFMultiplySSize(tif,alpha?4:3,stripsize, "gtStripSeparate"); if (bufsize == 0) { - TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Integer overflow in %s", "gtStripSeparate"); return (0); } flip = setorientation(img); if (flip & FLIP_VERTICALLY) { + if ( w > INT_MAX ) { + TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Width overflow"); + return (0); + } y = h - 1; toskew = -(int32)(w + w); } @@ -1053,15 +1083,22 @@ gtStripSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) fromskew = (w < imagewidth ? imagewidth - w : 0); for (row = 0; row < h; row += nrow) { + uint32 temp; rowstoread = rowsperstrip - (row + img->row_offset) % rowsperstrip; nrow = (row + rowstoread > h ? h - row : rowstoread); offset_row = row + img->row_offset; + temp = (row + img->row_offset)%rowsperstrip + nrow; + if( scanline > 0 && temp > (size_t)(TIFF_TMSIZE_T_MAX / scanline) ) + { + TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Integer overflow in gtStripSeparate"); + return 0; + } if( buf == NULL ) { if (_TIFFReadEncodedStripAndAllocBuffer( tif, TIFFComputeStrip(tif, offset_row, 0), (void**) &buf, bufsize, - ((row + img->row_offset)%rowsperstrip + nrow) * scanline)==(tmsize_t)(-1) + temp * scanline)==(tmsize_t)(-1) && (buf == NULL || img->stoponerr)) { ret = 0; @@ -1081,7 +1118,7 @@ gtStripSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) } } else if (TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, 0), - p0, ((row + img->row_offset)%rowsperstrip + nrow) * scanline)==(tmsize_t)(-1) + p0, temp * scanline)==(tmsize_t)(-1) && img->stoponerr) { ret = 0; @@ -1089,7 +1126,7 @@ gtStripSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) } if (colorchannels > 1 && TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, 1), - p1, ((row + img->row_offset)%rowsperstrip + nrow) * scanline) == (tmsize_t)(-1) + p1, temp * scanline) == (tmsize_t)(-1) && img->stoponerr) { ret = 0; @@ -1097,7 +1134,7 @@ gtStripSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) } if (colorchannels > 1 && TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, 2), - p2, ((row + img->row_offset)%rowsperstrip + nrow) * scanline) == (tmsize_t)(-1) + p2, temp * scanline) == (tmsize_t)(-1) && img->stoponerr) { ret = 0; @@ -1106,7 +1143,7 @@ gtStripSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h) if (alpha) { if (TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, colorchannels), - pa, ((row + img->row_offset)%rowsperstrip + nrow) * scanline)==(tmsize_t)(-1) + pa, temp * scanline)==(tmsize_t)(-1) && img->stoponerr) { ret = 0; @@ -2957,7 +2994,7 @@ TIFFReadRGBATileExt(TIFF* tif, uint32 col, 
uint32 row, uint32 * raster, int stop if( !TIFFIsTiled( tif ) ) { TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), - "Can't use TIFFReadRGBATile() with stripped file."); + "Can't use TIFFReadRGBATile() with striped file."); return (0); } diff --git a/3rdparty/libtiff/tif_jbig.c b/3rdparty/libtiff/tif_jbig.c index 7ffe8851e8..a3500e0b6f 100644 --- a/3rdparty/libtiff/tif_jbig.c +++ b/3rdparty/libtiff/tif_jbig.c @@ -199,6 +199,7 @@ static int JBIGEncode(TIFF* tif, uint8* buffer, tmsize_t size, uint16 s) int TIFFInitJBIG(TIFF* tif, int scheme) { + (void)scheme; assert(scheme == COMPRESSION_JBIG); /* diff --git a/3rdparty/libtiff/tif_jpeg.c b/3rdparty/libtiff/tif_jpeg.c index f2ddc331a0..6711137a92 100644 --- a/3rdparty/libtiff/tif_jpeg.c +++ b/3rdparty/libtiff/tif_jpeg.c @@ -466,7 +466,8 @@ std_empty_output_buffer(j_compress_ptr cinfo) } #endif - TIFFFlushData1(tif); + if( !TIFFFlushData1(tif) ) + return FALSE; sp->dest.next_output_byte = (JOCTET*) tif->tif_rawdata; sp->dest.free_in_buffer = (size_t) tif->tif_rawdatasize; @@ -780,12 +781,9 @@ JPEGFixupTagsSubsampling(TIFF* tif) */ static const char module[] = "JPEGFixupTagsSubsampling"; struct JPEGFixupTagsSubsamplingData m; + uint64 fileoffset = TIFFGetStrileOffset(tif, 0); - _TIFFFillStriles( tif ); - - if( tif->tif_dir.td_stripbytecount == NULL - || tif->tif_dir.td_stripoffset == NULL - || tif->tif_dir.td_stripbytecount[0] == 0 ) + if( fileoffset == 0 ) { /* Do not even try to check if the first strip/tile does not yet exist, as occurs when GDAL has created a new NULL file @@ -804,9 +802,9 @@ JPEGFixupTagsSubsampling(TIFF* tif) } m.buffercurrentbyte=NULL; m.bufferbytesleft=0; - m.fileoffset=tif->tif_dir.td_stripoffset[0]; + m.fileoffset=fileoffset; m.filepositioned=0; - m.filebytesleft=tif->tif_dir.td_stripbytecount[0]; + m.filebytesleft=TIFFGetStrileByteCount(tif, 0); if (!JPEGFixupTagsSubsamplingSec(&m)) TIFFWarningExt(tif->tif_clientdata,module, "Unable to auto-correct subsampling values, likely corrupt JPEG compressed data in first strip/tile; auto-correcting skipped"); @@ -940,7 +938,10 @@ JPEGFixupTagsSubsamplingReadByte(struct JPEGFixupTagsSubsamplingData* data, uint return(0); if (!data->filepositioned) { - TIFFSeekFile(data->tif,data->fileoffset,SEEK_SET); + if (TIFFSeekFile(data->tif,data->fileoffset,SEEK_SET) == (toff_t)-1) + { + return 0; + } data->filepositioned=1; } m=data->buffersize; @@ -1209,35 +1210,37 @@ JPEGPreDecode(TIFF* tif, uint16 s) /* store for all coefficients */ /* See call to jinit_d_coef_controller() from master_selection() */ /* in libjpeg */ - toff_t nRequiredMemory = (toff_t)sp->cinfo.d.image_width * - sp->cinfo.d.image_height * - sp->cinfo.d.num_components * - ((td->td_bitspersample+7)/8); - /* BLOCK_SMOOTHING_SUPPORTED is generally defined, so we need */ - /* to replicate the logic of jinit_d_coef_controller() */ - if( sp->cinfo.d.progressive_mode ) - nRequiredMemory *= 3; -#ifndef TIFF_LIBJPEG_LARGEST_MEM_ALLOC -#define TIFF_LIBJPEG_LARGEST_MEM_ALLOC (100 * 1024 * 1024) -#endif + /* 1 MB for regular libjpeg usage */ + toff_t nRequiredMemory = 1024 * 1024; - if( nRequiredMemory > TIFF_LIBJPEG_LARGEST_MEM_ALLOC && + for (ci = 0; ci < sp->cinfo.d.num_components; ci++) { + const jpeg_component_info *compptr = &(sp->cinfo.d.comp_info[ci]); + if( compptr->h_samp_factor > 0 && compptr->v_samp_factor > 0 ) + { + nRequiredMemory += (toff_t)( + ((compptr->width_in_blocks + compptr->h_samp_factor - 1) / compptr->h_samp_factor)) * + ((compptr->height_in_blocks + compptr->v_samp_factor - 1) / 
compptr->v_samp_factor) * + sizeof(JBLOCK); + } + } + + if( sp->cinfo.d.mem->max_memory_to_use > 0 && + nRequiredMemory > (toff_t)(sp->cinfo.d.mem->max_memory_to_use) && getenv("LIBTIFF_ALLOW_LARGE_LIBJPEG_MEM_ALLOC") == NULL ) { - TIFFErrorExt(tif->tif_clientdata, module, - "Reading this strip would require libjpeg to allocate " - "at least %u bytes. " - "This is disabled since above the %u threshold. " - "You may override this restriction by defining the " - "LIBTIFF_ALLOW_LARGE_LIBJPEG_MEM_ALLOC environment variable, " - "or recompile libtiff by defining the " - "TIFF_LIBJPEG_LARGEST_MEM_ALLOC macro to a value greater " - "than %u", - (unsigned)nRequiredMemory, - (unsigned)TIFF_LIBJPEG_LARGEST_MEM_ALLOC, - (unsigned)TIFF_LIBJPEG_LARGEST_MEM_ALLOC); - return (0); + TIFFErrorExt(tif->tif_clientdata, module, + "Reading this image would require libjpeg to allocate " + "at least %u bytes. " + "This is disabled since above the %u threshold. " + "You may override this restriction by defining the " + "LIBTIFF_ALLOW_LARGE_LIBJPEG_MEM_ALLOC environment variable, " + "or setting the JPEGMEM environment variable to a value greater " + "or equal to '%uM'", + (unsigned)(nRequiredMemory), + (unsigned)(sp->cinfo.d.mem->max_memory_to_use), + (unsigned)((nRequiredMemory + 1000000 - 1) / 1000000)); + return 0; } } @@ -1566,7 +1569,7 @@ JPEGDecodeRaw(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s) JSAMPLE *outptr = (JSAMPLE*)tmpbuf + clumpoffset; #else JSAMPLE *outptr = (JSAMPLE*)buf + clumpoffset; - if (cc < (tmsize_t) (clumpoffset + samples_per_clump*(clumps_per_line-1) + hsamp)) { + if (cc < (tmsize_t)(clumpoffset + (tmsize_t)samples_per_clump*(clumps_per_line-1) + hsamp)) { TIFFErrorExt(tif->tif_clientdata, "JPEGDecodeRaw", "application buffer not large enough for all data, possible subsampling issue"); return 0; @@ -2126,8 +2129,8 @@ JPEGEncodeRaw(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s) /* data is expected to be supplied in multiples of a clumpline */ /* a clumpline is equivalent to v_sampling desubsampled scanlines */ /* TODO: the following calculation of bytesperclumpline, should substitute calculation of sp->bytesperline, except that it is per v_sampling lines */ - bytesperclumpline = (((sp->cinfo.c.image_width+sp->h_sampling-1)/sp->h_sampling) - *(sp->h_sampling*sp->v_sampling+2)*sp->cinfo.c.data_precision+7) + bytesperclumpline = ((((tmsize_t)sp->cinfo.c.image_width+sp->h_sampling-1)/sp->h_sampling) + *((tmsize_t)sp->h_sampling*sp->v_sampling+2)*sp->cinfo.c.data_precision+7) /8; nrows = ( cc / bytesperclumpline ) * sp->v_sampling; @@ -2347,7 +2350,7 @@ JPEGVGetField(TIFF* tif, uint32 tag, va_list ap) switch (tag) { case TIFFTAG_JPEGTABLES: *va_arg(ap, uint32*) = sp->jpegtables_length; - *va_arg(ap, void**) = sp->jpegtables; + *va_arg(ap, const void**) = sp->jpegtables; break; case TIFFTAG_JPEGQUALITY: *va_arg(ap, int*) = sp->jpegquality; @@ -2482,6 +2485,7 @@ TIFFInitJPEG(TIFF* tif, int scheme) { JPEGState* sp; + (void)scheme; assert(scheme == COMPRESSION_JPEG); /* diff --git a/3rdparty/libtiff/tif_luv.c b/3rdparty/libtiff/tif_luv.c index 192fa26188..3bd02e88e4 100644 --- a/3rdparty/libtiff/tif_luv.c +++ b/3rdparty/libtiff/tif_luv.c @@ -193,6 +193,7 @@ LogL16Decode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s) tmsize_t cc; int rc; + (void)s; assert(s == 0); assert(sp != NULL); @@ -266,6 +267,7 @@ LogLuvDecode24(TIFF* tif, uint8* op, tmsize_t occ, uint16 s) unsigned char* bp; uint32* tp; + (void)s; assert(s == 0); assert(sp != NULL); @@ -326,6 +328,7 @@ LogLuvDecode32(TIFF* tif, uint8* op, 
tmsize_t occ, uint16 s) tmsize_t cc; int rc; + (void)s; assert(s == 0); sp = DecoderState(tif); assert(sp != NULL); @@ -447,6 +450,7 @@ LogL16Encode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) int rc=0, mask; tmsize_t beg; + (void)s; assert(s == 0); assert(sp != NULL); npixels = cc / sp->pixel_size; @@ -541,6 +545,7 @@ LogLuvEncode24(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) uint8* op; uint32* tp; + (void)s; assert(s == 0); assert(sp != NULL); npixels = cc / sp->pixel_size; @@ -598,6 +603,7 @@ LogLuvEncode32(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) int rc=0, mask; tmsize_t beg; + (void)s; assert(s == 0); assert(sp != NULL); @@ -742,7 +748,7 @@ LogLuvEncodeTile(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) #undef exp2 /* Conflict with C'99 function */ #define exp2(x) exp(M_LN2*(x)) -static int itrunc(double x, int m) +static int tiff_itrunc(double x, int m) { if( m == SGILOGENCODE_NODITHER ) return (int)x; @@ -777,9 +783,9 @@ LogL16fromY(double Y, int em) /* get 16-bit LogL from Y */ if (Y <= -1.8371976e19) return (0xffff); if (Y > 5.4136769e-20) - return itrunc(256.*(log2(Y) + 64.), em); + return tiff_itrunc(256.*(log2(Y) + 64.), em); if (Y < -5.4136769e-20) - return (~0x7fff | itrunc(256.*(log2(-Y) + 64.), em)); + return (~0x7fff | tiff_itrunc(256.*(log2(-Y) + 64.), em)); return (0); } @@ -855,7 +861,7 @@ LogL10fromY(double Y, int em) /* get 10-bit LogL from Y */ else if (Y <= .00024283) return (0); else - return itrunc(64.*(log2(Y) + 12.), em); + return tiff_itrunc(64.*(log2(Y) + 12.), em); } #define NANGLES 100 @@ -925,12 +931,12 @@ uv_encode(double u, double v, int em) /* encode (u',v') coordinates */ if (v < UV_VSTART) return oog_encode(u, v); - vi = itrunc((v - UV_VSTART)*(1./UV_SQSIZ), em); + vi = tiff_itrunc((v - UV_VSTART)*(1./UV_SQSIZ), em); if (vi >= UV_NVS) return oog_encode(u, v); if (u < uv_row[vi].ustart) return oog_encode(u, v); - ui = itrunc((u - uv_row[vi].ustart)*(1./UV_SQSIZ), em); + ui = tiff_itrunc((u - uv_row[vi].ustart)*(1./UV_SQSIZ), em); if (ui >= uv_row[vi].nus) return oog_encode(u, v); @@ -1099,7 +1105,7 @@ Luv24fromLuv48(LogLuvState* sp, uint8* op, tmsize_t n) else if (sp->encode_meth == SGILOGENCODE_NODITHER) Le = (luv3[0]-3314) >> 2; else - Le = itrunc(.25*(luv3[0]-3314.), sp->encode_meth); + Le = tiff_itrunc(.25*(luv3[0]-3314.), sp->encode_meth); Ce = uv_encode((luv3[1]+.5)/(1<<15), (luv3[2]+.5)/(1<<15), sp->encode_meth); @@ -1155,10 +1161,10 @@ LogLuv32fromXYZ(float XYZ[3], int em) v = 9.*XYZ[1] / s; } if (u <= 0.) ue = 0; - else ue = itrunc(UVSCALE*u, em); + else ue = tiff_itrunc(UVSCALE*u, em); if (ue > 255) ue = 255; if (v <= 0.) 
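The (void)s; and (void)scheme; lines added throughout these codec files exist because the parameter's only use is the assert that follows; when libtiff is built with NDEBUG the assert disappears and an unused-parameter warning would fire. The idiom in isolation:

    #include <assert.h>

    #define COMPRESSION_EXAMPLE 42   /* illustrative codec id */

    static int init_codec(int scheme)
    {
        (void)scheme;   /* only read by the assert; silences the unused-
                           parameter warning when NDEBUG removes it */
        assert(scheme == COMPRESSION_EXAMPLE);
        return 1;
    }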
ve = 0; - else ve = itrunc(UVSCALE*v, em); + else ve = tiff_itrunc(UVSCALE*v, em); if (ve > 255) ve = 255; /* combine encodings */ return (Le << 16 | ue << 8 | ve); @@ -1238,8 +1244,8 @@ Luv32fromLuv48(LogLuvState* sp, uint8* op, tmsize_t n) } while (n-- > 0) { *luv++ = (uint32)luv3[0] << 16 | - (itrunc(luv3[1]*(UVSCALE/(1<<15)), sp->encode_meth) << 8 & 0xff00) | - (itrunc(luv3[2]*(UVSCALE/(1<<15)), sp->encode_meth) & 0xff); + (tiff_itrunc(luv3[1]*(UVSCALE/(1<<15)), sp->encode_meth) << 8 & 0xff00) | + (tiff_itrunc(luv3[2]*(UVSCALE/(1<<15)), sp->encode_meth) & 0xff); luv3 += 3; } } @@ -1269,16 +1275,10 @@ LogL16GuessDataFmt(TIFFDirectory *td) return (SGILOGDATAFMT_UNKNOWN); } - -#define TIFF_SIZE_T_MAX ((size_t) ~ ((size_t)0)) -#define TIFF_TMSIZE_T_MAX (tmsize_t)(TIFF_SIZE_T_MAX >> 1) - static tmsize_t multiply_ms(tmsize_t m1, tmsize_t m2) { - if( m1 == 0 || m2 > TIFF_TMSIZE_T_MAX / m1 ) - return 0; - return m1 * m2; + return _TIFFMultiplySSize(NULL, m1, m2, NULL); } static int @@ -1512,7 +1512,7 @@ LogLuvSetupEncode(TIFF* tif) switch (td->td_photometric) { case PHOTOMETRIC_LOGLUV: if (!LogLuvInitState(tif)) - break; + return (0); if (td->td_compression == COMPRESSION_SGILOG24) { tif->tif_encoderow = LogLuvEncode24; switch (sp->user_datafmt) { @@ -1545,7 +1545,7 @@ LogLuvSetupEncode(TIFF* tif) break; case PHOTOMETRIC_LOGL: if (!LogL16InitState(tif)) - break; + return (0); tif->tif_encoderow = LogL16Encode; switch (sp->user_datafmt) { case SGILOGDATAFMT_FLOAT: @@ -1561,7 +1561,7 @@ LogLuvSetupEncode(TIFF* tif) TIFFErrorExt(tif->tif_clientdata, module, "Inappropriate photometric interpretation %d for SGILog compression; %s", td->td_photometric, "must be either LogLUV or LogL"); - break; + return (0); } sp->encoder_state = 1; return (1); diff --git a/3rdparty/libtiff/tif_lzma.c b/3rdparty/libtiff/tif_lzma.c index 3f6096b62a..e150bd635d 100644 --- a/3rdparty/libtiff/tif_lzma.c +++ b/3rdparty/libtiff/tif_lzma.c @@ -300,7 +300,8 @@ LZMAEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) } if (sp->stream.avail_out == 0) { tif->tif_rawcc = tif->tif_rawdatasize; - TIFFFlushData1(tif); + if (!TIFFFlushData1(tif)) + return 0; sp->stream.next_out = tif->tif_rawdata; sp->stream.avail_out = (size_t)tif->tif_rawdatasize; /* this is a safe typecast, as check is made already in LZMAPreEncode */ } @@ -328,7 +329,8 @@ LZMAPostEncode(TIFF* tif) if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize) { tif->tif_rawcc = tif->tif_rawdatasize - sp->stream.avail_out; - TIFFFlushData1(tif); + if (!TIFFFlushData1(tif)) + return 0; sp->stream.next_out = tif->tif_rawdata; sp->stream.avail_out = (size_t)tif->tif_rawdatasize; /* this is a safe typecast, as check is made already in ZIPPreEncode */ } @@ -418,6 +420,7 @@ TIFFInitLZMA(TIFF* tif, int scheme) LZMAState* sp; lzma_stream tmp_stream = LZMA_STREAM_INIT; + (void)scheme; assert( scheme == COMPRESSION_LZMA ); /* diff --git a/3rdparty/libtiff/tif_lzw.c b/3rdparty/libtiff/tif_lzw.c index 21064f29ae..d92d0fd354 100644 --- a/3rdparty/libtiff/tif_lzw.c +++ b/3rdparty/libtiff/tif_lzw.c @@ -214,19 +214,16 @@ LZWSetupDecode(TIFF* tif) return (0); } - DecoderState(tif)->dec_codetab = NULL; - DecoderState(tif)->dec_decode = NULL; + sp = DecoderState(tif); + sp->dec_codetab = NULL; + sp->dec_decode = NULL; /* * Setup predictor setup. 
*/ (void) TIFFPredictorInit(tif); - - sp = DecoderState(tif); } - assert(sp != NULL); - if (sp->dec_codetab == NULL) { sp->dec_codetab = (code_t*)_TIFFmalloc(CSIZE*sizeof (code_t)); if (sp->dec_codetab == NULL) { @@ -1161,6 +1158,7 @@ int TIFFInitLZW(TIFF* tif, int scheme) { static const char module[] = "TIFFInitLZW"; + (void)scheme; assert(scheme == COMPRESSION_LZW); /* * Allocate state block so tag methods have storage to record values. @@ -1218,7 +1216,7 @@ bad: * from this software without specific prior written permission. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED - * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ #endif /* LZW_SUPPORT */ diff --git a/3rdparty/libtiff/tif_ojpeg.c b/3rdparty/libtiff/tif_ojpeg.c index 27385d8c47..133d1f1c49 100644 --- a/3rdparty/libtiff/tif_ojpeg.c +++ b/3rdparty/libtiff/tif_ojpeg.c @@ -74,7 +74,7 @@ or errors, up to the point where either these values are read, or it's clear they aren't there. This means that some of the data is read twice, but we feel speed in correcting these values is important enough to warrant this sacrifice. Although - there is currently no define or other configuration mechanism to disable this behaviour, + there is currently no define or other configuration mechanism to disable this behavior, the actual header scanning is build to robustly respond with error report if it should encounter an uncorrected mismatch of subsampling values. See OJPEGReadHeaderInfoSecStreamSof. @@ -243,6 +243,7 @@ typedef enum { typedef struct { TIFF* tif; int decoder_ok; + int error_in_raw_data_decoding; #ifndef LIBJPEG_ENCAP_EXTERNAL JMP_BUF exit_jmpbuf; #endif @@ -420,6 +421,7 @@ TIFFInitOJPEG(TIFF* tif, int scheme) static const char module[]="TIFFInitOJPEG"; OJPEGState* sp; + (void)scheme; assert(scheme==COMPRESSION_OJPEG); /* @@ -497,15 +499,15 @@ OJPEGVGetField(TIFF* tif, uint32 tag, va_list ap) break; case TIFFTAG_JPEGQTABLES: *va_arg(ap,uint32*)=(uint32)sp->qtable_offset_count; - *va_arg(ap,void**)=(void*)sp->qtable_offset; + *va_arg(ap,const void**)=(const void*)sp->qtable_offset; break; case TIFFTAG_JPEGDCTABLES: *va_arg(ap,uint32*)=(uint32)sp->dctable_offset_count; - *va_arg(ap,void**)=(void*)sp->dctable_offset; + *va_arg(ap,const void**)=(const void*)sp->dctable_offset; break; case TIFFTAG_JPEGACTABLES: *va_arg(ap,uint32*)=(uint32)sp->actable_offset_count; - *va_arg(ap,void**)=(void*)sp->actable_offset; + *va_arg(ap,const void**)=(const void*)sp->actable_offset; break; case TIFFTAG_JPEGPROC: *va_arg(ap,uint16*)=(uint16)sp->jpeg_proc; @@ -657,7 +659,7 @@ static int OJPEGSetupDecode(TIFF* tif) { static const char module[]="OJPEGSetupDecode"; - TIFFWarningExt(tif->tif_clientdata,module,"Depreciated and troublesome old-style JPEG compression mode, please convert to new-style JPEG compression and notify vendor of writing software"); + TIFFWarningExt(tif->tif_clientdata,module,"Deprecated and troublesome old-style JPEG compression mode, please convert to new-style JPEG compression and notify vendor of writing software"); return(1); } @@ -678,7 +680,7 @@ OJPEGPreDecode(TIFF* tif, uint16 s) if (OJPEGReadSecondarySos(tif,s)==0) return(0); } - if isTiled(tif) + if (isTiled(tif)) m=tif->tif_curtile; else m=tif->tif_curstrip; @@ -742,6 +744,7 @@ OJPEGPreDecodeSkipRaw(TIFF* tif) } m-=sp->subsampling_convert_clines-sp->subsampling_convert_state; 
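The "if isTiled(tif)" spellings fixed above only ever compiled because tiffiop.h wraps the macro body in its own parentheses; writing the parentheses at the call site keeps the code valid C even if the macro is ever replaced by a function. Roughly (macro body and flag value approximated from tiffiop.h):

    #include <stdint.h>

    #define TIFF_ISTILED 0x00400              /* illustrative flag value */
    struct tiff_like { uint32_t tif_flags; };
    /* The outer parentheses are what made "if isTiled(tif)" parse at all: */
    #define isTiled(t) (((t)->tif_flags & TIFF_ISTILED) != 0)

    static int demo(const struct tiff_like *t)
    {
        if (isTiled(t))        /* parentheses now explicit at the call site */
            return 1;
        return 0;
    }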
sp->subsampling_convert_state=0; + sp->error_in_raw_data_decoding=0; } while (m>=sp->subsampling_convert_clines) { @@ -792,6 +795,10 @@ OJPEGDecode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s) TIFFErrorExt(tif->tif_clientdata,module,"Cannot decode: decoder not correctly initialized"); return 0; } + if( sp->error_in_raw_data_decoding ) + { + return 0; + } if (sp->libjpeg_jpeg_query_style==0) { if (OJPEGDecodeRaw(tif,buf,cc)==0) @@ -832,7 +839,10 @@ OJPEGDecodeRaw(TIFF* tif, uint8* buf, tmsize_t cc) if (sp->subsampling_convert_state==0) { if (jpeg_read_raw_data_encap(sp,&(sp->libjpeg_jpeg_decompress_struct),sp->subsampling_convert_ycbcrimage,sp->subsampling_ver*8)==0) + { + sp->error_in_raw_data_decoding = 1; return(0); + } } oy=sp->subsampling_convert_ybuf+sp->subsampling_convert_state*sp->subsampling_ver*sp->subsampling_convert_ylinelen; ocb=sp->subsampling_convert_cbbuf+sp->subsampling_convert_state*sp->subsampling_convert_clinelen; @@ -990,7 +1000,6 @@ OJPEGSubsamplingCorrect(TIFF* tif) OJPEGState* sp=(OJPEGState*)tif->tif_data; uint8 mh; uint8 mv; - _TIFFFillStriles( tif ); assert(sp->subsamplingcorrect_done==0); if ((tif->tif_dir.td_samplesperpixel!=3) || ((tif->tif_dir.td_photometric!=PHOTOMETRIC_YCBCR) && @@ -1046,7 +1055,7 @@ OJPEGReadHeaderInfo(TIFF* tif) assert(sp->readheader_done==0); sp->image_width=tif->tif_dir.td_imagewidth; sp->image_length=tif->tif_dir.td_imagelength; - if isTiled(tif) + if (isTiled(tif)) { sp->strile_width=tif->tif_dir.td_tilewidth; sp->strile_length=tif->tif_dir.td_tilelength; @@ -1056,6 +1065,8 @@ OJPEGReadHeaderInfo(TIFF* tif) { sp->strile_width=sp->image_width; sp->strile_length=tif->tif_dir.td_rowsperstrip; + if( sp->strile_length == (uint32)-1 ) + sp->strile_length = sp->image_length; sp->strile_length_total=sp->image_length; } if (tif->tif_dir.td_samplesperpixel==1) @@ -1082,6 +1093,12 @@ OJPEGReadHeaderInfo(TIFF* tif) } if (sp->strile_lengthimage_length) { + if (((sp->subsampling_hor!=1) && (sp->subsampling_hor!=2) && (sp->subsampling_hor!=4)) || + ((sp->subsampling_ver!=1) && (sp->subsampling_ver!=2) && (sp->subsampling_ver!=4))) + { + TIFFErrorExt(tif->tif_clientdata,module,"Invalid subsampling values"); + return(0); + } if (sp->strile_length%(sp->subsampling_ver*8)!=0) { TIFFErrorExt(tif->tif_clientdata,module,"Incompatible vertical subsampling and image strip/tile length"); @@ -1197,7 +1214,13 @@ OJPEGWriteHeaderInfo(TIFF* tif) sp->subsampling_convert_ybuflen=sp->subsampling_convert_ylinelen*sp->subsampling_convert_ylines; sp->subsampling_convert_cbuflen=sp->subsampling_convert_clinelen*sp->subsampling_convert_clines; sp->subsampling_convert_ycbcrbuflen=sp->subsampling_convert_ybuflen+2*sp->subsampling_convert_cbuflen; - sp->subsampling_convert_ycbcrbuf=_TIFFmalloc(sp->subsampling_convert_ycbcrbuflen); + /* The calloc is not normally necessary, except in some edge/broken cases */ + /* for example for a tiled image of height 1 with a tile height of 1 and subsampling_hor=subsampling_ver=2 */ + /* In that case, libjpeg will only fill the 8 first lines of the 16 lines */ + /* See https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16844 */ + /* Even if this case is allowed (?), its handling is broken because OJPEGPreDecode() should also likely */ + /* reset subsampling_convert_state to 0 when changing tile. 
*/ + sp->subsampling_convert_ycbcrbuf=_TIFFcalloc(1, sp->subsampling_convert_ycbcrbuflen); if (sp->subsampling_convert_ycbcrbuf==0) { TIFFErrorExt(tif->tif_clientdata,module,"Out of memory"); @@ -1223,10 +1246,11 @@ OJPEGWriteHeaderInfo(TIFF* tif) *m++=sp->subsampling_convert_cbbuf+n*sp->subsampling_convert_clinelen; for (n=0; nsubsampling_convert_clines; n++) *m++=sp->subsampling_convert_crbuf+n*sp->subsampling_convert_clinelen; - sp->subsampling_convert_clinelenout=((sp->strile_width+sp->subsampling_hor-1)/sp->subsampling_hor); + sp->subsampling_convert_clinelenout=sp->strile_width/sp->subsampling_hor + ((sp->strile_width % sp->subsampling_hor) != 0 ? 1 : 0); sp->subsampling_convert_state=0; + sp->error_in_raw_data_decoding=0; sp->bytes_per_line=sp->subsampling_convert_clinelenout*(sp->subsampling_ver*sp->subsampling_hor+2); - sp->lines_per_strile=((sp->strile_length+sp->subsampling_ver-1)/sp->subsampling_ver); + sp->lines_per_strile=sp->strile_length/sp->subsampling_ver + ((sp->strile_length % sp->subsampling_ver) != 0 ? 1 : 0); sp->subsampling_convert_log=1; } } @@ -1240,6 +1264,26 @@ OJPEGWriteHeaderInfo(TIFF* tif) } if (jpeg_start_decompress_encap(sp,&(sp->libjpeg_jpeg_decompress_struct))==0) return(0); + if(sp->libjpeg_jpeg_decompress_struct.image_width != sp->strile_width ) { + TIFFErrorExt(tif->tif_clientdata,module, + "jpeg_start_decompress() returned image_width = %d, " + "expected %d", + sp->libjpeg_jpeg_decompress_struct.image_width, + sp->strile_width); + return 0; + } + if(sp->libjpeg_jpeg_decompress_struct.max_h_samp_factor != sp->subsampling_hor || + sp->libjpeg_jpeg_decompress_struct.max_v_samp_factor != sp->subsampling_ver) { + TIFFErrorExt(tif->tif_clientdata,module, + "jpeg_start_decompress() returned max_h_samp_factor = %d " + "and max_v_samp_factor = %d, expected %d and %d", + sp->libjpeg_jpeg_decompress_struct.max_h_samp_factor, + sp->libjpeg_jpeg_decompress_struct.max_v_samp_factor, + sp->subsampling_hor, + sp->subsampling_ver); + return 0; + } + sp->writeheader_done=1; return(1); } @@ -1272,7 +1316,9 @@ OJPEGReadHeaderInfoSec(TIFF* tif) } else { - if ((sp->jpeg_interchange_format_length==0) || (sp->jpeg_interchange_format+sp->jpeg_interchange_format_length>sp->file_size)) + if ((sp->jpeg_interchange_format_length==0) || + (sp->jpeg_interchange_format > TIFF_UINT64_MAX - sp->jpeg_interchange_format_length) || + (sp->jpeg_interchange_format+sp->jpeg_interchange_format_length>sp->file_size)) sp->jpeg_interchange_format_length=sp->file_size-sp->jpeg_interchange_format; } } @@ -1989,32 +2035,30 @@ OJPEGReadBufferFill(OJPEGState* sp) sp->in_buffer_source=osibsStrile; break; case osibsStrile: - if (!_TIFFFillStriles( sp->tif ) - || sp->tif->tif_dir.td_stripoffset == NULL - || sp->tif->tif_dir.td_stripbytecount == NULL) - return 0; - if (sp->in_buffer_next_strile==sp->in_buffer_strile_count) sp->in_buffer_source=osibsEof; else { - sp->in_buffer_file_pos=sp->tif->tif_dir.td_stripoffset[sp->in_buffer_next_strile]; + int err = 0; + sp->in_buffer_file_pos=TIFFGetStrileOffsetWithErr(sp->tif, sp->in_buffer_next_strile, &err); + if( err ) + return 0; if (sp->in_buffer_file_pos!=0) { + uint64 bytecount = TIFFGetStrileByteCountWithErr(sp->tif, sp->in_buffer_next_strile, &err); + if( err ) + return 0; if (sp->in_buffer_file_pos>=sp->file_size) sp->in_buffer_file_pos=0; - else if (sp->tif->tif_dir.td_stripbytecount==NULL) + else if (bytecount==0) sp->in_buffer_file_togo=sp->file_size-sp->in_buffer_file_pos; else { - if (sp->tif->tif_dir.td_stripbytecount == 0) { - 
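OJPEGDecode above now consults a sticky error_in_raw_data_decoding flag: once jpeg_read_raw_data() has failed for a strile, later decode calls fail immediately instead of re-entering libjpeg with poisoned state, and the flag is cleared whenever a new strile is set up. The shape of the pattern:

    struct ojpeg_like {
        int decoder_ok;
        int error_in_raw_data;      /* sticky until the next strile reset */
    };

    static int raw_read(void) { return 0; }  /* stand-in for the libjpeg call */

    static int decode(struct ojpeg_like *sp)
    {
        if (!sp->decoder_ok || sp->error_in_raw_data)
            return 0;               /* fail fast on every later call */
        if (raw_read() == 0) {
            sp->error_in_raw_data = 1;
            return 0;
        }
        return 1;
    }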
TIFFErrorExt(sp->tif->tif_clientdata,sp->tif->tif_name,"Strip byte counts are missing"); - return(0); - } - sp->in_buffer_file_togo=sp->tif->tif_dir.td_stripbytecount[sp->in_buffer_next_strile]; + sp->in_buffer_file_togo=bytecount; if (sp->in_buffer_file_togo==0) sp->in_buffer_file_pos=0; - else if (sp->in_buffer_file_pos+sp->in_buffer_file_togo>sp->file_size) + else if (sp->in_buffer_file_pos > TIFF_UINT64_MAX - sp->in_buffer_file_togo || + sp->in_buffer_file_pos+sp->in_buffer_file_togo>sp->file_size) sp->in_buffer_file_togo=sp->file_size-sp->in_buffer_file_pos; } } diff --git a/3rdparty/libtiff/tif_open.c b/3rdparty/libtiff/tif_open.c index c574c452aa..a0e31583a6 100644 --- a/3rdparty/libtiff/tif_open.c +++ b/3rdparty/libtiff/tif_open.c @@ -104,6 +104,7 @@ TIFFClientOpen( } n; n.a8[0]=1; n.a8[1]=0; + (void)n; #ifdef WORDS_BIGENDIAN assert(n.a16==256); #else @@ -131,6 +132,7 @@ TIFFClientOpen( if (!readproc || !writeproc || !seekproc || !closeproc || !sizeproc) { TIFFErrorExt(clientdata, module, "One of the client procedures is NULL pointer."); + _TIFFfree(tif); goto bad2; } tif->tif_readproc = readproc; @@ -164,7 +166,7 @@ TIFFClientOpen( /* * Process library-specific flags in the open mode string. * The following flags may be used to control intrinsic library - * behaviour that may or may not be desirable (usually for + * behavior that may or may not be desirable (usually for * compatibility with some application that claims to support * TIFF but only supports some brain dead idea of what the * vendor thinks TIFF is): @@ -181,6 +183,8 @@ TIFFClientOpen( * 'h' read TIFF header only, do not load the first IFD * '4' ClassicTIFF for creating a file (default) * '8' BigTIFF for creating a file + * 'D' enable use of deferred strip/tile offset/bytecount array loading. + * 'O' on-demand loading of values instead of whole array loading (implies D) * * The use of the 'l' and 'b' flags is strongly discouraged. * These flags are provided solely because numerous vendors, @@ -203,7 +207,7 @@ TIFFClientOpen( * not do right now. * * The 'M' and 'm' flags are provided because some virtual memory - * systems exhibit poor behaviour when large images are mapped. + * systems exhibit poor behavior when large images are mapped. * These options permit clients to control the use of memory-mapped * files on a per-file basis. * @@ -262,7 +266,22 @@ TIFFClientOpen( if (m&O_CREAT) tif->tif_flags |= TIFF_BIGTIFF; break; + case 'D': + tif->tif_flags |= TIFF_DEFERSTRILELOAD; + break; + case 'O': + if( m == O_RDONLY ) + tif->tif_flags |= (TIFF_LAZYSTRILELOAD | TIFF_DEFERSTRILELOAD); + break; } + +#ifdef DEFER_STRILE_LOAD + /* Compatibility with old DEFER_STRILE_LOAD compilation flag */ + /* Probably unneeded, since to the best of my knowledge (E. Rouault) */ + /* GDAL was the only user of this, and will now use the new 'D' flag */ + tif->tif_flags |= TIFF_DEFERSTRILELOAD; +#endif + /* * Read in TIFF header. 
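The new 'D' and 'O' open-mode letters documented above enable deferred strip/tile offset/bytecount handling: 'D' postpones loading the arrays until first use, and 'O' (read-only opens, implies 'D') fetches individual values on demand. A hedged usage sketch against this vendored libtiff (file name illustrative):

    #include <tiffio.h>

    int main(void)
    {
        /* 'O': strile offsets/byte counts are read lazily, one value at a
         * time, instead of materializing whole arrays at open time. */
        TIFF *tif = TIFFOpen("big_sparse.tif", "rO");
        if (!tif)
            return 1;
        uint64 off = TIFFGetStrileOffset(tif, 0);
        uint64 cnt = TIFFGetStrileByteCount(tif, 0);
        (void)off; (void)cnt;
        TIFFClose(tif);
        return 0;
    }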
*/ diff --git a/3rdparty/libtiff/tif_pixarlog.c b/3rdparty/libtiff/tif_pixarlog.c index b1e48d99c9..f291201505 100644 --- a/3rdparty/libtiff/tif_pixarlog.c +++ b/3rdparty/libtiff/tif_pixarlog.c @@ -634,16 +634,10 @@ PixarLogGuessDataFmt(TIFFDirectory *td) return guess; } -#define TIFF_SIZE_T_MAX ((size_t) ~ ((size_t)0)) -#define TIFF_TMSIZE_T_MAX (tmsize_t)(TIFF_SIZE_T_MAX >> 1) - static tmsize_t multiply_ms(tmsize_t m1, tmsize_t m2) { - assert(m1 >= 0 && m2 >= 0); - if( m1 == 0 || m2 > TIFF_TMSIZE_T_MAX / m1 ) - return 0; - return m1 * m2; + return _TIFFMultiplySSize(NULL, m1, m2, NULL); } static tmsize_t @@ -1153,7 +1147,7 @@ PixarLogEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) llen = sp->stride * td->td_imagewidth; /* Check against the number of elements (of size uint16) of sp->tbuf */ - if( n > (tmsize_t)(td->td_rowsperstrip * llen) ) + if( n > ((tmsize_t)td->td_rowsperstrip * llen) ) { TIFFErrorExt(tif->tif_clientdata, module, "Too many input bytes provided"); @@ -1206,7 +1200,8 @@ PixarLogEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) } if (sp->stream.avail_out == 0) { tif->tif_rawcc = tif->tif_rawdatasize; - TIFFFlushData1(tif); + if (!TIFFFlushData1(tif)) + return 0; sp->stream.next_out = tif->tif_rawdata; sp->stream.avail_out = (uInt) tif->tif_rawdatasize; /* this is a safe typecast, as check is made already in PixarLogPreEncode */ } @@ -1236,7 +1231,8 @@ PixarLogPostEncode(TIFF* tif) if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize) { tif->tif_rawcc = tif->tif_rawdatasize - sp->stream.avail_out; - TIFFFlushData1(tif); + if (!TIFFFlushData1(tif)) + return 0; sp->stream.next_out = tif->tif_rawdata; sp->stream.avail_out = (uInt) tif->tif_rawdatasize; /* this is a safe typecast, as check is made already in PixarLogPreEncode */ } @@ -1404,6 +1400,7 @@ TIFFInitPixarLog(TIFF* tif, int scheme) PixarLogState* sp; + (void)scheme; assert(scheme == COMPRESSION_PIXARLOG); /* diff --git a/3rdparty/libtiff/tif_predict.c b/3rdparty/libtiff/tif_predict.c index b775663a7b..c023397459 100644 --- a/3rdparty/libtiff/tif_predict.c +++ b/3rdparty/libtiff/tif_predict.c @@ -116,7 +116,7 @@ PredictorSetupDecode(TIFF* tif) TIFFDirectory* td = &tif->tif_dir; /* Note: when PredictorSetup() fails, the effets of setupdecode() */ - /* will not be "cancelled" so setupdecode() might be robust to */ + /* will not be "canceled" so setupdecode() might be robust to */ /* be called several times. 
*/ if (!(*sp->setupdecode)(tif) || !PredictorSetup(tif)) return 0; @@ -270,8 +270,8 @@ PredictorSetupEncode(TIFF* tif) } /* Remarks related to C standard compliance in all below functions : */ -/* - to avoid any undefined behaviour, we only operate on unsigned types */ -/* since the behaviour of "overflows" is defined (wrap over) */ +/* - to avoid any undefined behavior, we only operate on unsigned types */ +/* since the behavior of "overflows" is defined (wrap over) */ /* - when storing into the byte stream, we explicitly mask with 0xff so */ /* as to make icc -check=conversions happy (not necessary by the standard) */ diff --git a/3rdparty/libtiff/tif_print.c b/3rdparty/libtiff/tif_print.c index 1d86adbf05..a0737941f4 100644 --- a/3rdparty/libtiff/tif_print.c +++ b/3rdparty/libtiff/tif_print.c @@ -652,8 +652,6 @@ TIFFPrintDirectory(TIFF* tif, FILE* fd, long flags) if (tif->tif_tagmethods.printdir) (*tif->tif_tagmethods.printdir)(tif, fd, flags); - _TIFFFillStriles( tif ); - if ((flags & TIFFPRINT_STRIPS) && TIFFFieldSet(tif,FIELD_STRIPOFFSETS)) { uint32 s; @@ -665,13 +663,13 @@ TIFFPrintDirectory(TIFF* tif, FILE* fd, long flags) #if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__)) fprintf(fd, " %3lu: [%8I64u, %8I64u]\n", (unsigned long) s, - td->td_stripoffset ? (unsigned __int64) td->td_stripoffset[s] : 0, - td->td_stripbytecount ? (unsigned __int64) td->td_stripbytecount[s] : 0); + (unsigned __int64) TIFFGetStrileOffset(tif, s), + (unsigned __int64) TIFFGetStrileByteCount(tif, s)); #else fprintf(fd, " %3lu: [%8llu, %8llu]\n", (unsigned long) s, - td->td_stripoffset ? (unsigned long long) td->td_stripoffset[s] : 0, - td->td_stripbytecount ? (unsigned long long) td->td_stripbytecount[s] : 0); + (unsigned long long) TIFFGetStrileOffset(tif, s), + (unsigned long long) TIFFGetStrileByteCount(tif, s)); #endif } } diff --git a/3rdparty/libtiff/tif_read.c b/3rdparty/libtiff/tif_read.c index 79c470cbf1..c4c868b1c5 100644 --- a/3rdparty/libtiff/tif_read.c +++ b/3rdparty/libtiff/tif_read.c @@ -29,9 +29,6 @@ #include "tiffiop.h" #include -#define TIFF_SIZE_T_MAX ((size_t) ~ ((size_t)0)) -#define TIFF_TMSIZE_T_MAX (tmsize_t)(TIFF_SIZE_T_MAX >> 1) - int TIFFFillStrip(TIFF* tif, uint32 strip); int TIFFFillTile(TIFF* tif, uint32 tile); static int TIFFStartStrip(TIFF* tif, uint32 strip); @@ -49,6 +46,8 @@ TIFFReadRawTile1(TIFF* tif, uint32 tile, void* buf, tmsize_t size, const char* m #define THRESHOLD_MULTIPLIER 10 #define MAX_THRESHOLD (THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER * INITIAL_THRESHOLD) +#define TIFF_INT64_MAX ((((int64)0x7FFFFFFF) << 32) | 0xFFFFFFFF) + /* Read 'size' bytes in tif_rawdata buffer starting at offset 'rawdata_offset' * Returns 1 in case of success, 0 otherwise. */ static int TIFFReadAndRealloc( TIFF* tif, tmsize_t size, @@ -61,6 +60,22 @@ static int TIFFReadAndRealloc( TIFF* tif, tmsize_t size, #endif tmsize_t already_read = 0; + +#if SIZEOF_SIZE_T != 8 + /* On 32 bit processes, if the request is large enough, check against */ + /* file size */ + if( size > 1000 * 1000 * 1000 ) + { + uint64 filesize = TIFFGetFileSize(tif); + if( (uint64)size >= filesize ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Chunk size requested is larger than file size."); + return 0; + } + } +#endif + /* On 64 bit processes, read first a maximum of 1 MB, then 10 MB, etc */ /* so as to avoid allocating too much memory in case the file is too */ /* short. 
We could ask for the file size, but this might be */ @@ -175,17 +190,14 @@ TIFFFillStripPartial( TIFF *tif, int strip, tmsize_t read_ahead, int restart ) tmsize_t to_read; tmsize_t read_ahead_mod; /* tmsize_t bytecountm; */ - - if (!_TIFFFillStriles( tif ) || !tif->tif_dir.td_stripbytecount) - return 0; - + /* * Expand raw data buffer, if needed, to hold data * strip coming from file (perhaps should set upper * bound on the size of a buffer we'll use?). */ - /* bytecountm=(tmsize_t) td->td_stripbytecount[strip]; */ + /* bytecountm=(tmsize_t) TIFFGetStrileByteCount(tif, strip); */ /* Not completely sure where the * 2 comes from, but probably for */ /* an exponentional growth strategy of tif_rawdatasize */ @@ -229,7 +241,7 @@ TIFFFillStripPartial( TIFF *tif, int strip, tmsize_t read_ahead, int restart ) /* ** Seek to the point in the file where more data should be read. */ - read_offset = td->td_stripoffset[strip] + read_offset = TIFFGetStrileOffset(tif, strip) + tif->tif_rawdataoff + tif->tif_rawdataloaded; if (!SeekOK(tif, read_offset)) { @@ -246,10 +258,10 @@ TIFFFillStripPartial( TIFF *tif, int strip, tmsize_t read_ahead, int restart ) to_read = read_ahead_mod - unused_data; else to_read = tif->tif_rawdatasize - unused_data; - if( (uint64) to_read > td->td_stripbytecount[strip] + if( (uint64) to_read > TIFFGetStrileByteCount(tif, strip) - tif->tif_rawdataoff - tif->tif_rawdataloaded ) { - to_read = (tmsize_t) td->td_stripbytecount[strip] + to_read = (tmsize_t) TIFFGetStrileByteCount(tif, strip) - tif->tif_rawdataoff - tif->tif_rawdataloaded; } @@ -288,7 +300,7 @@ TIFFFillStripPartial( TIFF *tif, int strip, tmsize_t read_ahead, int restart ) /* For JPEG, if there are multiple scans (can generally be known */ /* with the read_ahead used), we need to read the whole strip */ if( tif->tif_dir.td_compression==COMPRESSION_JPEG && - (uint64)tif->tif_rawcc < td->td_stripbytecount[strip] ) + (uint64)tif->tif_rawcc < TIFFGetStrileByteCount(tif, strip) ) { if( TIFFJPEGIsFullStripRequired(tif) ) { @@ -347,9 +359,7 @@ TIFFSeek(TIFF* tif, uint32 row, uint16 sample ) * read it a few lines at a time? 
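TIFFReadAndRealloc above gains two defenses: on 32-bit builds, a request over ~1 GB is first checked against the actual file size, and on 64-bit builds the buffer grows in stages (1 MB, then 10 MB, ...) so a corrupt byte count fails after a small read rather than after one giant allocation. A self-contained sketch of the staged strategy, using stdio in place of libtiff's read proc:

    #include <stdio.h>
    #include <stdlib.h>

    /* Read up to `want` bytes in geometrically growing slices so a bogus
     * byte count from a corrupt file fails after ~1 MB, not a huge malloc. */
    static size_t staged_read(FILE *f, char **buf, size_t want)
    {
        size_t threshold = 1024 * 1024, got = 0;
        *buf = NULL;
        while (got < want) {
            size_t target = want < threshold ? want : threshold;
            char *p = realloc(*buf, target);
            if (!p) { free(*buf); *buf = NULL; return 0; }
            *buf = p;
            size_t n = fread(*buf + got, 1, target - got, f);
            got += n;
            if (n == 0)
                break;          /* short file: stop early, return what we got */
            threshold *= 10;    /* 1 MB, 10 MB, 100 MB, ... */
        }
        return got;
    }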
*/ #if defined(CHUNKY_STRIP_READ_SUPPORT) - if (!_TIFFFillStriles( tif ) || !tif->tif_dir.td_stripbytecount) - return 0; - whole_strip = tif->tif_dir.td_stripbytecount[strip] < 10 + whole_strip = TIFFGetStrileByteCount(tif, strip) < 10 || isMapped(tif); if( td->td_compression == COMPRESSION_LERC || td->td_compression == COMPRESSION_JBIG ) @@ -402,7 +412,7 @@ TIFFSeek(TIFF* tif, uint32 row, uint16 sample ) else if( !whole_strip ) { if( ((tif->tif_rawdata + tif->tif_rawdataloaded) - tif->tif_rawcp) < read_ahead - && (uint64) tif->tif_rawdataoff+tif->tif_rawdataloaded < td->td_stripbytecount[strip] ) + && (uint64) tif->tif_rawdataoff+tif->tif_rawdataloaded < TIFFGetStrileByteCount(tif, strip) ) { if( !TIFFFillStripPartial(tif,strip,read_ahead,0) ) return 0; @@ -599,16 +609,11 @@ static tmsize_t TIFFReadRawStrip1(TIFF* tif, uint32 strip, void* buf, tmsize_t size, const char* module) { - TIFFDirectory *td = &tif->tif_dir; - - if (!_TIFFFillStriles( tif )) - return ((tmsize_t)(-1)); - assert((tif->tif_flags&TIFF_NOREADRAW)==0); if (!isMapped(tif)) { tmsize_t cc; - if (!SeekOK(tif, td->td_stripoffset[strip])) { + if (!SeekOK(tif, TIFFGetStrileOffset(tif, strip))) { TIFFErrorExt(tif->tif_clientdata, module, "Seek error at scanline %lu, strip %lu", (unsigned long) tif->tif_row, (unsigned long) strip); @@ -634,8 +639,8 @@ TIFFReadRawStrip1(TIFF* tif, uint32 strip, void* buf, tmsize_t size, } else { tmsize_t ma = 0; tmsize_t n; - if ((td->td_stripoffset[strip] > (uint64)TIFF_TMSIZE_T_MAX)|| - ((ma=(tmsize_t)td->td_stripoffset[strip])>tif->tif_size)) + if ((TIFFGetStrileOffset(tif, strip) > (uint64)TIFF_TMSIZE_T_MAX)|| + ((ma=(tmsize_t)TIFFGetStrileOffset(tif, strip))>tif->tif_size)) { n=0; } @@ -679,12 +684,10 @@ static tmsize_t TIFFReadRawStripOrTile2(TIFF* tif, uint32 strip_or_tile, int is_strip, tmsize_t size, const char* module) { - TIFFDirectory *td = &tif->tif_dir; - assert( !isMapped(tif) ); assert((tif->tif_flags&TIFF_NOREADRAW)==0); - if (!SeekOK(tif, td->td_stripoffset[strip_or_tile])) { + if (!SeekOK(tif, TIFFGetStrileOffset(tif, strip_or_tile))) { if( is_strip ) { TIFFErrorExt(tif->tif_clientdata, module, @@ -720,7 +723,7 @@ TIFFReadRawStrip(TIFF* tif, uint32 strip, void* buf, tmsize_t size) { static const char module[] = "TIFFReadRawStrip"; TIFFDirectory *td = &tif->tif_dir; - uint64 bytecount; + uint64 bytecount64; tmsize_t bytecountm; if (!TIFFCheckRead(tif, 0)) @@ -738,31 +741,23 @@ TIFFReadRawStrip(TIFF* tif, uint32 strip, void* buf, tmsize_t size) "Compression scheme does not support access to raw uncompressed data"); return ((tmsize_t)(-1)); } - bytecount = td->td_stripbytecount[strip]; - if ((int64)bytecount <= 0) { -#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__)) - TIFFErrorExt(tif->tif_clientdata, module, - "%I64u: Invalid strip byte count, strip %lu", - (unsigned __int64) bytecount, - (unsigned long) strip); -#else - TIFFErrorExt(tif->tif_clientdata, module, - "%llu: Invalid strip byte count, strip %lu", - (unsigned long long) bytecount, - (unsigned long) strip); -#endif - return ((tmsize_t)(-1)); - } - bytecountm = (tmsize_t)bytecount; - if ((uint64)bytecountm!=bytecount) { - TIFFErrorExt(tif->tif_clientdata, module, "Integer overflow"); - return ((tmsize_t)(-1)); - } - if (size != (tmsize_t)(-1) && size < bytecountm) + bytecount64 = TIFFGetStrileByteCount(tif, strip); + if (size != (tmsize_t)(-1) && (uint64)size <= bytecount64) bytecountm = size; + else + bytecountm = _TIFFCastUInt64ToSSize(tif, bytecount64, module); + if( bytecountm == 0 ) { + return 
((tmsize_t)(-1)); + } return (TIFFReadRawStrip1(tif, strip, buf, bytecountm, module)); } +TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW +static uint64 NoSanitizeSubUInt64(uint64 a, uint64 b) +{ + return a - b; +} + /* * Read the specified strip and setup for decoding. The data buffer is * expanded, as necessary, to hold the strip's data. @@ -773,13 +768,10 @@ TIFFFillStrip(TIFF* tif, uint32 strip) static const char module[] = "TIFFFillStrip"; TIFFDirectory *td = &tif->tif_dir; - if (!_TIFFFillStriles( tif ) || !tif->tif_dir.td_stripbytecount) - return 0; - if ((tif->tif_flags&TIFF_NOREADRAW)==0) { - uint64 bytecount = td->td_stripbytecount[strip]; - if ((int64)bytecount <= 0) { + uint64 bytecount = TIFFGetStrileByteCount(tif, strip); + if( bytecount == 0 || bytecount > (uint64)TIFF_INT64_MAX ) { #if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__)) TIFFErrorExt(tif->tif_clientdata, module, "Invalid strip byte count %I64u, strip %lu", @@ -806,7 +798,7 @@ TIFFFillStrip(TIFF* tif, uint32 strip) (bytecount - 4096) / 10 > (uint64)stripsize ) { uint64 newbytecount = (uint64)stripsize * 10 + 4096; - if( (int64)newbytecount >= 0 ) + if( newbytecount == 0 || newbytecount > (uint64)TIFF_INT64_MAX ) { #if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__)) TIFFWarningExt(tif->tif_clientdata, module, @@ -831,13 +823,13 @@ TIFFFillStrip(TIFF* tif, uint32 strip) * We must check for overflow, potentially causing * an OOB read. Instead of simple * - * td->td_stripoffset[strip]+bytecount > tif->tif_size + * TIFFGetStrileOffset(tif, strip)+bytecount > tif->tif_size * * comparison (which can overflow) we do the following * two comparisons: */ if (bytecount > (uint64)tif->tif_size || - td->td_stripoffset[strip] > (uint64)tif->tif_size - bytecount) { + TIFFGetStrileOffset(tif, strip) > (uint64)tif->tif_size - bytecount) { /* * This error message might seem strange, but * it's what would happen if a read were done @@ -849,7 +841,7 @@ TIFFFillStrip(TIFF* tif, uint32 strip) "Read error on strip %lu; " "got %I64u bytes, expected %I64u", (unsigned long) strip, - (unsigned __int64) tif->tif_size - td->td_stripoffset[strip], + (unsigned __int64) NoSanitizeSubUInt64(tif->tif_size, TIFFGetStrileOffset(tif, strip)), (unsigned __int64) bytecount); #else TIFFErrorExt(tif->tif_clientdata, module, @@ -857,7 +849,7 @@ TIFFFillStrip(TIFF* tif, uint32 strip) "Read error on strip %lu; " "got %llu bytes, expected %llu", (unsigned long) strip, - (unsigned long long) tif->tif_size - td->td_stripoffset[strip], + (unsigned long long) NoSanitizeSubUInt64(tif->tif_size, TIFFGetStrileOffset(tif, strip)), (unsigned long long) bytecount); #endif tif->tif_curstrip = NOSTRIP; @@ -886,7 +878,7 @@ TIFFFillStrip(TIFF* tif, uint32 strip) } tif->tif_flags &= ~TIFF_MYBUFFER; tif->tif_rawdatasize = (tmsize_t)bytecount; - tif->tif_rawdata = tif->tif_base + (tmsize_t)td->td_stripoffset[strip]; + tif->tif_rawdata = tif->tif_base + (tmsize_t)TIFFGetStrileOffset(tif, strip); tif->tif_rawdataoff = 0; tif->tif_rawdataloaded = (tmsize_t) bytecount; @@ -1101,16 +1093,11 @@ _TIFFReadEncodedTileAndAllocBuffer(TIFF* tif, uint32 tile, static tmsize_t TIFFReadRawTile1(TIFF* tif, uint32 tile, void* buf, tmsize_t size, const char* module) { - TIFFDirectory *td = &tif->tif_dir; - - if (!_TIFFFillStriles( tif )) - return ((tmsize_t)(-1)); - assert((tif->tif_flags&TIFF_NOREADRAW)==0); if (!isMapped(tif)) { tmsize_t cc; - if (!SeekOK(tif, td->td_stripoffset[tile])) { + if (!SeekOK(tif, TIFFGetStrileOffset(tif, tile))) { 
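TIFFReadRawStrip and TIFFReadRawTile above replace the repeated cast-and-compare dance with _TIFFCastUInt64ToSSize, which yields 0 (after reporting an error) when a uint64 byte count cannot be represented in the signed tmsize_t. The round-trip test it relies on, in isolation:

    #include <stddef.h>
    #include <stdint.h>

    typedef ptrdiff_t tmsize_t;   /* stand-in for libtiff's signed size type */

    /* 0 means "does not fit"; the real helper also calls TIFFErrorExt. */
    static tmsize_t cast_u64_to_ssize(uint64_t val)
    {
        tmsize_t n = (tmsize_t)val;   /* narrowing checked by the round trip */
        if (n < 0 || (uint64_t)n != val)
            return 0;
        return n;
    }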
TIFFErrorExt(tif->tif_clientdata, module, "Seek error at row %lu, col %lu, tile %lu", (unsigned long) tif->tif_row, @@ -1140,9 +1127,9 @@ TIFFReadRawTile1(TIFF* tif, uint32 tile, void* buf, tmsize_t size, const char* m } else { tmsize_t ma,mb; tmsize_t n; - ma=(tmsize_t)td->td_stripoffset[tile]; + ma=(tmsize_t)TIFFGetStrileOffset(tif, tile); mb=ma+size; - if ((td->td_stripoffset[tile] > (uint64)TIFF_TMSIZE_T_MAX)||(ma>tif->tif_size)) + if ((TIFFGetStrileOffset(tif, tile) > (uint64)TIFF_TMSIZE_T_MAX)||(ma>tif->tif_size)) n=0; else if ((mbtif->tif_size)) n=tif->tif_size-ma; @@ -1198,13 +1185,12 @@ TIFFReadRawTile(TIFF* tif, uint32 tile, void* buf, tmsize_t size) "Compression scheme does not support access to raw uncompressed data"); return ((tmsize_t)(-1)); } - bytecount64 = td->td_stripbytecount[tile]; - if (size != (tmsize_t)(-1) && (uint64)size < bytecount64) - bytecount64 = (uint64)size; - bytecountm = (tmsize_t)bytecount64; - if ((uint64)bytecountm!=bytecount64) - { - TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow"); + bytecount64 = TIFFGetStrileByteCount(tif, tile); + if (size != (tmsize_t)(-1) && (uint64)size <= bytecount64) + bytecountm = size; + else + bytecountm = _TIFFCastUInt64ToSSize(tif, bytecount64, module); + if( bytecountm == 0 ) { return ((tmsize_t)(-1)); } return (TIFFReadRawTile1(tif, tile, buf, bytecountm, module)); @@ -1220,13 +1206,10 @@ TIFFFillTile(TIFF* tif, uint32 tile) static const char module[] = "TIFFFillTile"; TIFFDirectory *td = &tif->tif_dir; - if (!_TIFFFillStriles( tif ) || !tif->tif_dir.td_stripbytecount) - return 0; - if ((tif->tif_flags&TIFF_NOREADRAW)==0) { - uint64 bytecount = td->td_stripbytecount[tile]; - if ((int64)bytecount <= 0) { + uint64 bytecount = TIFFGetStrileByteCount(tif, tile); + if( bytecount == 0 || bytecount > (uint64)TIFF_INT64_MAX ) { #if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__)) TIFFErrorExt(tif->tif_clientdata, module, "%I64u: Invalid tile byte count, tile %lu", @@ -1253,7 +1236,7 @@ TIFFFillTile(TIFF* tif, uint32 tile) (bytecount - 4096) / 10 > (uint64)stripsize ) { uint64 newbytecount = (uint64)stripsize * 10 + 4096; - if( (int64)newbytecount >= 0 ) + if( newbytecount == 0 || newbytecount > (uint64)TIFF_INT64_MAX ) { #if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__)) TIFFWarningExt(tif->tif_clientdata, module, @@ -1278,13 +1261,13 @@ TIFFFillTile(TIFF* tif, uint32 tile) * We must check for overflow, potentially causing * an OOB read. 
Instead of simple * - * td->td_stripoffset[tile]+bytecount > tif->tif_size + * TIFFGetStrileOffset(tif, tile)+bytecount > tif->tif_size * * comparison (which can overflow) we do the following * two comparisons: */ if (bytecount > (uint64)tif->tif_size || - td->td_stripoffset[tile] > (uint64)tif->tif_size - bytecount) { + TIFFGetStrileOffset(tif, tile) > (uint64)tif->tif_size - bytecount) { tif->tif_curtile = NOTILE; return (0); } @@ -1313,7 +1296,7 @@ TIFFFillTile(TIFF* tif, uint32 tile) tif->tif_rawdatasize = (tmsize_t)bytecount; tif->tif_rawdata = - tif->tif_base + (tmsize_t)td->td_stripoffset[tile]; + tif->tif_base + (tmsize_t)TIFFGetStrileOffset(tif, tile); tif->tif_rawdataoff = 0; tif->tif_rawdataloaded = (tmsize_t) bytecount; tif->tif_flags |= TIFF_BUFFERMMAP; @@ -1440,9 +1423,6 @@ TIFFStartStrip(TIFF* tif, uint32 strip) { TIFFDirectory *td = &tif->tif_dir; - if (!_TIFFFillStriles( tif ) || !tif->tif_dir.td_stripbytecount) - return 0; - if ((tif->tif_flags & TIFF_CODERSETUP) == 0) { if (!(*tif->tif_setupdecode)(tif)) return (0); @@ -1463,10 +1443,18 @@ TIFFStartStrip(TIFF* tif, uint32 strip) if( tif->tif_rawdataloaded > 0 ) tif->tif_rawcc = tif->tif_rawdataloaded; else - tif->tif_rawcc = (tmsize_t)td->td_stripbytecount[strip]; + tif->tif_rawcc = (tmsize_t)TIFFGetStrileByteCount(tif, strip); } - return ((*tif->tif_predecode)(tif, - (uint16)(strip / td->td_stripsperimage))); + if ((*tif->tif_predecode)(tif, + (uint16)(strip / td->td_stripsperimage)) == 0 ) { + /* Needed for example for scanline access, if tif_predecode */ + /* fails, and we try to read the same strip again. Without invalidating */ + /* tif_curstrip, we'd call tif_decoderow() on a possibly invalid */ + /* codec state. */ + tif->tif_curstrip = NOSTRIP; + return 0; + } + return 1; } /* @@ -1480,9 +1468,6 @@ TIFFStartTile(TIFF* tif, uint32 tile) TIFFDirectory *td = &tif->tif_dir; uint32 howmany32; - if (!_TIFFFillStriles( tif ) || !tif->tif_dir.td_stripbytecount) - return 0; - if ((tif->tif_flags & TIFF_CODERSETUP) == 0) { if (!(*tif->tif_setupdecode)(tif)) return (0); @@ -1513,7 +1498,7 @@ TIFFStartTile(TIFF* tif, uint32 tile) if( tif->tif_rawdataloaded > 0 ) tif->tif_rawcc = tif->tif_rawdataloaded; else - tif->tif_rawcc = (tmsize_t)td->td_stripbytecount[tile]; + tif->tif_rawcc = (tmsize_t)TIFFGetStrileByteCount(tif, tile); } return ((*tif->tif_predecode)(tif, (uint16)(tile/td->td_stripsperimage))); @@ -1528,13 +1513,100 @@ TIFFCheckRead(TIFF* tif, int tiles) } if (tiles ^ isTiled(tif)) { TIFFErrorExt(tif->tif_clientdata, tif->tif_name, tiles ? - "Can not read tiles from a stripped image" : + "Can not read tiles from a striped image" : "Can not read scanlines from a tiled image"); return (0); } return (1); } +/* Use the provided input buffer (inbuf, insize) and decompress it into + * (outbuf, outsize). + * This function replaces the use of TIFFReadEncodedStrip()/TIFFReadEncodedTile() + * when the user can provide the buffer for the input data, for example when + * he wants to avoid libtiff to read the strile offset/count values from the + * [Strip|Tile][Offsets/ByteCounts] array. + * inbuf content must be writable (if bit reversal is needed) + * Returns 1 in case of success, 0 otherwise. 
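The comment block above spells out why the offset and byte count are compared in two steps: "offset + bytecount > size" can wrap in uint64 arithmetic and let an out-of-bounds read through. The safe predicate, stated on its own:

    #include <stdint.h>

    /* Does [offset, offset + count) lie within a file of `size` bytes?
     * Naive "offset + count > size" can wrap around; this cannot. */
    static int fits_in_file(uint64_t offset, uint64_t count, uint64_t size)
    {
        return count <= size && offset <= size - count;
    }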
+ */ +int TIFFReadFromUserBuffer(TIFF* tif, uint32 strile, + void* inbuf, tmsize_t insize, + void* outbuf, tmsize_t outsize) +{ + static const char module[] = "TIFFReadFromUserBuffer"; + TIFFDirectory *td = &tif->tif_dir; + int ret = 1; + uint32 old_tif_flags = tif->tif_flags; + tmsize_t old_rawdatasize = tif->tif_rawdatasize; + void* old_rawdata = tif->tif_rawdata; + + if (tif->tif_mode == O_WRONLY) { + TIFFErrorExt(tif->tif_clientdata, tif->tif_name, "File not open for reading"); + return 0; + } + if (tif->tif_flags&TIFF_NOREADRAW) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Compression scheme does not support access to raw uncompressed data"); + return 0; + } + + tif->tif_flags &= ~TIFF_MYBUFFER; + tif->tif_flags |= TIFF_BUFFERMMAP; + tif->tif_rawdatasize = insize; + tif->tif_rawdata = inbuf; + tif->tif_rawdataoff = 0; + tif->tif_rawdataloaded = insize; + + if (!isFillOrder(tif, td->td_fillorder) && + (tif->tif_flags & TIFF_NOBITREV) == 0) + { + TIFFReverseBits(inbuf, insize); + } + + if( TIFFIsTiled(tif) ) + { + if( !TIFFStartTile(tif, strile) || + !(*tif->tif_decodetile)(tif, (uint8*) outbuf, outsize, + (uint16)(strile/td->td_stripsperimage)) ) + { + ret = 0; + } + } + else + { + uint32 rowsperstrip=td->td_rowsperstrip; + uint32 stripsperplane; + if (rowsperstrip>td->td_imagelength) + rowsperstrip=td->td_imagelength; + stripsperplane= TIFFhowmany_32_maxuint_compat(td->td_imagelength, rowsperstrip); + if( !TIFFStartStrip(tif, strile) || + !(*tif->tif_decodestrip)(tif, (uint8*) outbuf, outsize, + (uint16)(strile/stripsperplane)) ) + { + ret = 0; + } + } + if( ret ) + { + (*tif->tif_postdecode)(tif, (uint8*) outbuf, outsize); + } + + if (!isFillOrder(tif, td->td_fillorder) && + (tif->tif_flags & TIFF_NOBITREV) == 0) + { + TIFFReverseBits(inbuf, insize); + } + + tif->tif_flags = old_tif_flags; + tif->tif_rawdatasize = old_rawdatasize; + tif->tif_rawdata = old_rawdata; + tif->tif_rawdataoff = 0; + tif->tif_rawdataloaded = 0; + + return ret; +} + void _TIFFNoPostDecode(TIFF* tif, uint8* buf, tmsize_t cc) { diff --git a/3rdparty/libtiff/tif_strip.c b/3rdparty/libtiff/tif_strip.c index 5b76fba56d..c08c60a792 100644 --- a/3rdparty/libtiff/tif_strip.c +++ b/3rdparty/libtiff/tif_strip.c @@ -129,15 +129,8 @@ TIFFVStripSize(TIFF* tif, uint32 nrows) { static const char module[] = "TIFFVStripSize"; uint64 m; - tmsize_t n; m=TIFFVStripSize64(tif,nrows); - n=(tmsize_t)m; - if ((uint64)n!=m) - { - TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow"); - n=0; - } - return(n); + return _TIFFCastUInt64ToSSize(tif, m, module); } /* @@ -147,8 +140,7 @@ uint64 TIFFRawStripSize64(TIFF* tif, uint32 strip) { static const char module[] = "TIFFRawStripSize64"; - TIFFDirectory* td = &tif->tif_dir; - uint64 bytecount = td->td_stripbytecount[strip]; + uint64 bytecount = TIFFGetStrileByteCount(tif, strip); if (bytecount == 0) { @@ -211,15 +203,8 @@ TIFFStripSize(TIFF* tif) { static const char module[] = "TIFFStripSize"; uint64 m; - tmsize_t n; m=TIFFStripSize64(tif); - n=(tmsize_t)m; - if ((uint64)n!=m) - { - TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow"); - n=0; - } - return(n); + return _TIFFCastUInt64ToSSize(tif, m, module); } /* @@ -330,14 +315,8 @@ TIFFScanlineSize(TIFF* tif) { static const char module[] = "TIFFScanlineSize"; uint64 m; - tmsize_t n; m=TIFFScanlineSize64(tif); - n=(tmsize_t)m; - if ((uint64)n!=m) { - TIFFErrorExt(tif->tif_clientdata,module,"Integer arithmetic overflow"); - n=0; - } - return(n); + return _TIFFCastUInt64ToSSize(tif, m, module); } /* @@ -366,15 +345,8 @@ 
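TIFFReadFromUserBuffer, added above, decodes one strile from compressed bytes the caller supplies, bypassing libtiff's own strile offset/bytecount lookups; note the documented requirement that the input buffer be writable, since bit reversal may happen in place. A hedged usage sketch (buffer acquisition left to the caller):

    #include <tiffio.h>

    /* Decode one strip whose raw bytes were fetched by the caller's own
     * I/O layer. `raw` must be writable (in-place bit reversal). */
    static int decode_from_memory(TIFF *tif, uint32 strip,
                                  void *raw, tmsize_t rawsize)
    {
        tmsize_t outsize = TIFFStripSize(tif);
        void *out = _TIFFmalloc(outsize);
        if (out == NULL)
            return 0;
        int ok = TIFFReadFromUserBuffer(tif, strip, raw, rawsize,
                                        out, outsize);
        /* ... consume `out` on success ... */
        _TIFFfree(out);
        return ok;
    }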
TIFFRasterScanlineSize(TIFF* tif) { static const char module[] = "TIFFRasterScanlineSize"; uint64 m; - tmsize_t n; m=TIFFRasterScanlineSize64(tif); - n=(tmsize_t)m; - if ((uint64)n!=m) - { - TIFFErrorExt(tif->tif_clientdata,module,"Integer arithmetic overflow"); - n=0; - } - return(n); + return _TIFFCastUInt64ToSSize(tif, m, module); } /* vim: set ts=8 sts=8 sw=8 noet: */ diff --git a/3rdparty/libtiff/tif_thunder.c b/3rdparty/libtiff/tif_thunder.c index 2388dbb66b..db6383a81a 100644 --- a/3rdparty/libtiff/tif_thunder.c +++ b/3rdparty/libtiff/tif_thunder.c @@ -122,17 +122,17 @@ ThunderDecode(TIFF* tif, uint8* op, tmsize_t maxpixels) break; case THUNDER_2BITDELTAS: /* 2-bit deltas */ if ((delta = ((n >> 4) & 3)) != DELTA2_SKIP) - SETPIXEL(op, lastpixel + twobitdeltas[delta]); + SETPIXEL(op, (unsigned)((int)lastpixel + twobitdeltas[delta])); if ((delta = ((n >> 2) & 3)) != DELTA2_SKIP) - SETPIXEL(op, lastpixel + twobitdeltas[delta]); + SETPIXEL(op, (unsigned)((int)lastpixel + twobitdeltas[delta])); if ((delta = (n & 3)) != DELTA2_SKIP) - SETPIXEL(op, lastpixel + twobitdeltas[delta]); + SETPIXEL(op, (unsigned)((int)lastpixel + twobitdeltas[delta])); break; case THUNDER_3BITDELTAS: /* 3-bit deltas */ if ((delta = ((n >> 3) & 7)) != DELTA3_SKIP) - SETPIXEL(op, lastpixel + threebitdeltas[delta]); + SETPIXEL(op, (unsigned)((int)lastpixel + threebitdeltas[delta])); if ((delta = (n & 7)) != DELTA3_SKIP) - SETPIXEL(op, lastpixel + threebitdeltas[delta]); + SETPIXEL(op, (unsigned)((int)lastpixel + threebitdeltas[delta])); break; case THUNDER_RAW: /* raw data */ SETPIXEL(op, n); diff --git a/3rdparty/libtiff/tif_tile.c b/3rdparty/libtiff/tif_tile.c index 58fe9354a3..661cc77154 100644 --- a/3rdparty/libtiff/tif_tile.c +++ b/3rdparty/libtiff/tif_tile.c @@ -181,15 +181,8 @@ TIFFTileRowSize(TIFF* tif) { static const char module[] = "TIFFTileRowSize"; uint64 m; - tmsize_t n; m=TIFFTileRowSize64(tif); - n=(tmsize_t)m; - if ((uint64)n!=m) - { - TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow"); - n=0; - } - return(n); + return _TIFFCastUInt64ToSSize(tif, m, module); } /* @@ -248,15 +241,8 @@ TIFFVTileSize(TIFF* tif, uint32 nrows) { static const char module[] = "TIFFVTileSize"; uint64 m; - tmsize_t n; m=TIFFVTileSize64(tif,nrows); - n=(tmsize_t)m; - if ((uint64)n!=m) - { - TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow"); - n=0; - } - return(n); + return _TIFFCastUInt64ToSSize(tif, m, module); } /* @@ -272,15 +258,8 @@ TIFFTileSize(TIFF* tif) { static const char module[] = "TIFFTileSize"; uint64 m; - tmsize_t n; m=TIFFTileSize64(tif); - n=(tmsize_t)m; - if ((uint64)n!=m) - { - TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow"); - n=0; - } - return(n); + return _TIFFCastUInt64ToSSize(tif, m, module); } /* diff --git a/3rdparty/libtiff/tif_unix.c b/3rdparty/libtiff/tif_unix.c index 874f1feb26..bea1ef7802 100644 --- a/3rdparty/libtiff/tif_unix.c +++ b/3rdparty/libtiff/tif_unix.c @@ -162,7 +162,7 @@ _tiffMapProc(thandle_t fd, void** pbase, toff_t* psize) { uint64 size64 = _tiffSizeProc(fd); tmsize_t sizem = (tmsize_t)size64; - if ((uint64)sizem==size64) { + if (size64 && (uint64)sizem==size64) { fd_as_handle_union_t fdh; fdh.h = fd; *pbase = (void*) diff --git a/3rdparty/libtiff/tif_webp.c b/3rdparty/libtiff/tif_webp.c index 22665f2d2b..a00478f6b9 100644 --- a/3rdparty/libtiff/tif_webp.c +++ b/3rdparty/libtiff/tif_webp.c @@ -267,6 +267,12 @@ TWebPPreDecode(TIFF* tif, uint16 s) segment_height = td->td_rowsperstrip; } + if( segment_width > 16383 || segment_height > 16383 ) { + 
TIFFErrorExt(tif->tif_clientdata, module, + "WEBP maximum image dimensions are 16383 x 16383."); + return 0; + } + if( (sp->state & LSTATE_INIT_DECODE) == 0 ) tif->tif_setupdecode(tif); @@ -333,7 +339,7 @@ TWebPSetupEncode(TIFF* tif) } /* check bits per sample and data type */ - if ((nBitsPerSample != 8) && (sampleFormat != 1)) { + if ((nBitsPerSample != 8) || (sampleFormat != SAMPLEFORMAT_UINT)) { TIFFErrorExt(tif->tif_clientdata, module, "WEBP driver requires 8 bit unsigned data"); return 0; @@ -356,7 +362,7 @@ TWebPSetupEncode(TIFF* tif) } if (!WebPConfigInitInternal(&sp->sEncoderConfig, WEBP_PRESET_DEFAULT, - sp->quality_level, + (float)sp->quality_level, WEBP_ENCODER_ABI_VERSION)) { TIFFErrorExt(tif->tif_clientdata, module, "Error creating WebP encoder configuration."); @@ -579,7 +585,7 @@ TWebPVSetField(TIFF* tif, uint32 tag, va_list ap) #if WEBP_ENCODER_ABI_VERSION >= 0x0100 sp->lossless = va_arg(ap, int); if (sp->lossless){ - sp->quality_level = 100.0f; + sp->quality_level = 100; } return 1; #else @@ -628,6 +634,7 @@ TIFFInitWebP(TIFF* tif, int scheme) static const char module[] = "TIFFInitWebP"; WebPState* sp; + (void)scheme; assert( scheme == COMPRESSION_WEBP ); /* @@ -656,7 +663,7 @@ TIFFInitWebP(TIFF* tif, int scheme) tif->tif_tagmethods.vsetfield = TWebPVSetField; /* hook for codec tags */ /* Default values for codec-specific fields */ - sp->quality_level = 75.0f; /* default comp. level */ + sp->quality_level = 75; /* default comp. level */ sp->lossless = 0; /* default to false */ sp->state = 0; sp->nSamples = 0; diff --git a/3rdparty/libtiff/tif_win32.c b/3rdparty/libtiff/tif_win32.c index 088880e7c4..8964569394 100644 --- a/3rdparty/libtiff/tif_win32.c +++ b/3rdparty/libtiff/tif_win32.c @@ -27,34 +27,38 @@ * Scott Wagner (wagner@itek.com), Itek Graphix, Rochester, NY USA */ -/* - CreateFileA/CreateFileW return type 'HANDLE'. - - thandle_t is declared like - - DECLARE_HANDLE(thandle_t); - - in tiffio.h. - - Windows (from winnt.h) DECLARE_HANDLE logic looks like - - #ifdef STRICT - typedef void *HANDLE; - #define DECLARE_HANDLE(name) struct name##__ { int unused; }; typedef struct name##__ *name - #else - typedef PVOID HANDLE; - #define DECLARE_HANDLE(name) typedef HANDLE name - #endif - - See http://bugzilla.maptools.org/show_bug.cgi?id=1941 for problems in WIN64 - builds resulting from this. Unfortunately, the proposed patch was lost. - -*/ - #include "tiffiop.h" #include +/* + CreateFileA/CreateFileW return type 'HANDLE' while TIFFFdOpen() takes 'int', + which is formally incompatible and can even seemingly be of different size: + HANDLE is 64 bit under Win64, while int is still 32 bits there. + + However, only the lower 32 bits of a HANDLE are significant under Win64 as, + for interoperability reasons, they must have the same values in 32- and + 64-bit programs running on the same system, see + + https://docs.microsoft.com/en-us/windows/win32/winprog64/interprocess-communication + + Because of this, it is safe to define the following trivial functions for + casting between ints and HANDLEs, which are only really needed to avoid + compiler warnings (and, perhaps, to make the code slightly more clear). + Note that using the intermediate cast to "intptr_t" is crucial for warning + avoidance, as this integer type has the same size as HANDLE in all builds. 
+*/ + +static inline thandle_t thandle_from_int(int ifd) +{ + return (thandle_t)(intptr_t)ifd; +} + +static inline int thandle_to_int(thandle_t fd) +{ + return (int)(intptr_t)fd; +} + static tmsize_t _tiffReadProc(thandle_t fd, void* buf, tmsize_t size) { @@ -151,9 +155,11 @@ _tiffCloseProc(thandle_t fd) static uint64 _tiffSizeProc(thandle_t fd) { - ULARGE_INTEGER m; - m.LowPart=GetFileSize(fd,&m.HighPart); - return(m.QuadPart); + LARGE_INTEGER m; + if (GetFileSizeEx(fd,&m)) + return(m.QuadPart); + else + return(0); } static int @@ -185,7 +191,7 @@ _tiffMapProc(thandle_t fd, void** pbase, toff_t* psize) size = _tiffSizeProc(fd); sizem = (tmsize_t)size; - if ((uint64)sizem!=size) + if (!size || (uint64)sizem!=size) return (0); /* By passing in 0 for the maximum file size, it specifies that we @@ -237,7 +243,7 @@ TIFFFdOpen(int ifd, const char* name, const char* mode) break; } } - tif = TIFFClientOpen(name, mode, (thandle_t)ifd, /* FIXME: WIN64 cast to pointer warning */ + tif = TIFFClientOpen(name, mode, thandle_from_int(ifd), _tiffReadProc, _tiffWriteProc, _tiffSeekProc, _tiffCloseProc, _tiffSizeProc, fSuppressMap ? _tiffDummyMapProc : _tiffMapProc, @@ -282,7 +288,7 @@ TIFFOpen(const char* name, const char* mode) return ((TIFF *)0); } - tif = TIFFFdOpen((int)fd, name, mode); /* FIXME: WIN64 cast from pointer to int warning */ + tif = TIFFFdOpen(thandle_to_int(fd), name, mode); if(!tif) CloseHandle(fd); return tif; @@ -337,7 +343,7 @@ TIFFOpenW(const wchar_t* name, const char* mode) NULL, NULL); } - tif = TIFFFdOpen((int)fd, /* FIXME: WIN64 cast from pointer to int warning */ + tif = TIFFFdOpen(thandle_to_int(fd), (mbname != NULL) ? mbname : "", mode); if(!tif) CloseHandle(fd); diff --git a/3rdparty/libtiff/tif_write.c b/3rdparty/libtiff/tif_write.c index a31ecd12c1..3af69ab4e7 100644 --- a/3rdparty/libtiff/tif_write.c +++ b/3rdparty/libtiff/tif_write.c @@ -128,10 +128,10 @@ TIFFWriteScanline(TIFF* tif, void* buf, uint32 row, uint16 sample) tif->tif_rawcc = 0; tif->tif_rawcp = tif->tif_rawdata; - if( td->td_stripbytecount[strip] > 0 ) + if( td->td_stripbytecount_p[strip] > 0 ) { /* if we are writing over existing tiles, zero length */ - td->td_stripbytecount[strip] = 0; + td->td_stripbytecount_p[strip] = 0; /* this forces TIFFAppendToStrip() to do a seek */ tif->tif_curoff = 0; @@ -176,6 +176,32 @@ TIFFWriteScanline(TIFF* tif, void* buf, uint32 row, uint16 sample) return (status); } +/* Make sure that at the first attempt of rewriting a tile/strip, we will have */ +/* more bytes available in the output buffer than the previous byte count, */ +/* so that TIFFAppendToStrip() will detect the overflow when it is called the first */ +/* time if the new compressed tile is bigger than the older one. (GDAL #4771) */ +static int _TIFFReserveLargeEnoughWriteBuffer(TIFF* tif, uint32 strip_or_tile) +{ + TIFFDirectory *td = &tif->tif_dir; + if( td->td_stripbytecount_p[strip_or_tile] > 0 ) + { + /* The +1 is to ensure at least one extra bytes */ + /* The +4 is because the LZW encoder flushes 4 bytes before the limit */ + uint64 safe_buffer_size = (uint64)(td->td_stripbytecount_p[strip_or_tile] + 1 + 4); + if( tif->tif_rawdatasize <= (tmsize_t)safe_buffer_size ) + { + if( !(TIFFWriteBufferSetup(tif, NULL, + (tmsize_t)TIFFroundup_64(safe_buffer_size, 1024))) ) + return 0; + } + + /* Force TIFFAppendToStrip() to consider placing data at end + of file. */ + tif->tif_curoff = 0; + } + return 1; +} + /* * Encode the supplied data and write it to the * specified strip. 
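_TIFFReserveLargeEnoughWriteBuffer above hoists duplicated logic out of TIFFWriteEncodedStrip and TIFFWriteEncodedTile: when rewriting an existing strile, the raw buffer is sized to the previous byte count plus 1 (so growth is detectable) plus 4 (the LZW encoder flushes four bytes before reaching the limit), rounded up to 1 KB, so TIFFAppendToStrip notices on the first attempt that the new compressed data is larger (GDAL #4771). The arithmetic in isolation:

    #include <stdint.h>

    /* Sketch of TIFFroundup_64: round x up to a multiple of mult. */
    static uint64_t roundup64(uint64_t x, uint64_t mult)
    {
        return ((x + mult - 1) / mult) * mult;
    }

    /* Previous strile held `prev` bytes: reserve prev + 1 + 4, 1 KB-aligned. */
    static uint64_t rewrite_buffer_size(uint64_t prev)
    {
        return roundup64(prev + 1 + 4, 1024);
    }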
@@ -222,6 +248,13 @@ TIFFWriteEncodedStrip(TIFF* tif, uint32 strip, void* data, tmsize_t cc) tif->tif_flags |= TIFF_BUF4WRITE; tif->tif_curstrip = strip; + if( !_TIFFReserveLargeEnoughWriteBuffer(tif, strip) ) { + return ((tmsize_t)(-1)); + } + + tif->tif_rawcc = 0; + tif->tif_rawcp = tif->tif_rawdata; + if (td->td_stripsperimage == 0) { TIFFErrorExt(tif->tif_clientdata, module, "Zero strips per image"); return ((tmsize_t) -1); @@ -234,27 +267,6 @@ TIFFWriteEncodedStrip(TIFF* tif, uint32 strip, void* data, tmsize_t cc) tif->tif_flags |= TIFF_CODERSETUP; } - if( td->td_stripbytecount[strip] > 0 ) - { - /* Make sure that at the first attempt of rewriting the tile, we will have */ - /* more bytes available in the output buffer than the previous byte count, */ - /* so that TIFFAppendToStrip() will detect the overflow when it is called the first */ - /* time if the new compressed tile is bigger than the older one. (GDAL #4771) */ - if( tif->tif_rawdatasize <= (tmsize_t)td->td_stripbytecount[strip] ) - { - if( !(TIFFWriteBufferSetup(tif, NULL, - (tmsize_t)TIFFroundup_64((uint64)(td->td_stripbytecount[strip] + 1), 1024))) ) - return ((tmsize_t)(-1)); - } - - /* Force TIFFAppendToStrip() to consider placing data at end - of file. */ - tif->tif_curoff = 0; - } - - tif->tif_rawcc = 0; - tif->tif_rawcp = tif->tif_rawdata; - tif->tif_flags &= ~TIFF_POSTENCODE; /* shortcut to avoid an extra memcpy() */ @@ -402,22 +414,8 @@ TIFFWriteEncodedTile(TIFF* tif, uint32 tile, void* data, tmsize_t cc) tif->tif_flags |= TIFF_BUF4WRITE; tif->tif_curtile = tile; - if( td->td_stripbytecount[tile] > 0 ) - { - /* Make sure that at the first attempt of rewriting the tile, we will have */ - /* more bytes available in the output buffer than the previous byte count, */ - /* so that TIFFAppendToStrip() will detect the overflow when it is called the first */ - /* time if the new compressed tile is bigger than the older one. (GDAL #4771) */ - if( tif->tif_rawdatasize <= (tmsize_t) td->td_stripbytecount[tile] ) - { - if( !(TIFFWriteBufferSetup(tif, NULL, - (tmsize_t)TIFFroundup_64((uint64)(td->td_stripbytecount[tile] + 1), 1024))) ) - return ((tmsize_t)(-1)); - } - - /* Force TIFFAppendToStrip() to consider placing data at end - of file. */ - tif->tif_curoff = 0; + if( !_TIFFReserveLargeEnoughWriteBuffer(tif, tile) ) { + return ((tmsize_t)(-1)); } tif->tif_rawcc = 0; @@ -535,22 +533,29 @@ TIFFSetupStrips(TIFF* tif) isUnspecified(tif, FIELD_ROWSPERSTRIP) ? td->td_samplesperpixel : TIFFNumberOfStrips(tif); td->td_nstrips = td->td_stripsperimage; + /* TIFFWriteDirectoryTagData has a limitation to 0x80000000U bytes */ + if( td->td_nstrips >= 0x80000000U / ((tif->tif_flags&TIFF_BIGTIFF)?0x8U:0x4U) ) + { + TIFFErrorExt(tif->tif_clientdata, "TIFFSetupStrips", + "Too large Strip/Tile Offsets/ByteCounts arrays"); + return 0; + } if (td->td_planarconfig == PLANARCONFIG_SEPARATE) td->td_stripsperimage /= td->td_samplesperpixel; - td->td_stripoffset = (uint64 *) + td->td_stripoffset_p = (uint64 *) _TIFFCheckMalloc(tif, td->td_nstrips, sizeof (uint64), "for \"StripOffsets\" array"); - td->td_stripbytecount = (uint64 *) + td->td_stripbytecount_p = (uint64 *) _TIFFCheckMalloc(tif, td->td_nstrips, sizeof (uint64), "for \"StripByteCounts\" array"); - if (td->td_stripoffset == NULL || td->td_stripbytecount == NULL) + if (td->td_stripoffset_p == NULL || td->td_stripbytecount_p == NULL) return (0); /* * Place data at the end-of-file * (by setting offsets to zero). 
*/ - _TIFFmemset(td->td_stripoffset, 0, td->td_nstrips*sizeof (uint64)); - _TIFFmemset(td->td_stripbytecount, 0, td->td_nstrips*sizeof (uint64)); + _TIFFmemset(td->td_stripoffset_p, 0, td->td_nstrips*sizeof (uint64)); + _TIFFmemset(td->td_stripbytecount_p, 0, td->td_nstrips*sizeof (uint64)); TIFFSetFieldBit(tif, FIELD_STRIPOFFSETS); TIFFSetFieldBit(tif, FIELD_STRIPBYTECOUNTS); return (1); @@ -572,7 +577,7 @@ TIFFWriteCheck(TIFF* tif, int tiles, const char* module) } if (tiles ^ isTiled(tif)) { TIFFErrorExt(tif->tif_clientdata, module, tiles ? - "Can not write tiles to a stripped image" : + "Can not write tiles to a striped image" : "Can not write scanlines to a tiled image"); return (0); } @@ -610,7 +615,7 @@ TIFFWriteCheck(TIFF* tif, int tiles, const char* module) return (0); } } - if (tif->tif_dir.td_stripoffset == NULL && !TIFFSetupStrips(tif)) { + if (tif->tif_dir.td_stripoffset_p == NULL && !TIFFSetupStrips(tif)) { tif->tif_dir.td_nstrips = 0; TIFFErrorExt(tif->tif_clientdata, module, "No space for %s arrays", isTiled(tif) ? "tile" : "strip"); @@ -628,6 +633,20 @@ TIFFWriteCheck(TIFF* tif, int tiles, const char* module) if (tif->tif_scanlinesize == 0) return (0); tif->tif_flags |= TIFF_BEENWRITING; + + if( tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 && + tif->tif_dir.td_stripoffset_entry.tdir_count == 0 && + tif->tif_dir.td_stripoffset_entry.tdir_type == 0 && + tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 && + tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0 && + !(tif->tif_flags & TIFF_DIRTYDIRECT) ) + { + TIFFForceStrileArrayWriting(tif); + } + return (1); } @@ -649,6 +668,10 @@ TIFFWriteBufferSetup(TIFF* tif, void* bp, tmsize_t size) if (size == (tmsize_t)(-1)) { size = (isTiled(tif) ? 
tif->tif_tilesize : TIFFStripSize(tif)); + + /* Adds 10% margin for cases where compression would expand a bit */ + if( size < TIFF_TMSIZE_T_MAX - size / 10 ) + size += size / 10; /* * Make raw data buffer at least 8K */ @@ -684,9 +707,9 @@ TIFFGrowStrips(TIFF* tif, uint32 delta, const char* module) uint64* new_stripbytecount; assert(td->td_planarconfig == PLANARCONFIG_CONTIG); - new_stripoffset = (uint64*)_TIFFrealloc(td->td_stripoffset, + new_stripoffset = (uint64*)_TIFFrealloc(td->td_stripoffset_p, (td->td_nstrips + delta) * sizeof (uint64)); - new_stripbytecount = (uint64*)_TIFFrealloc(td->td_stripbytecount, + new_stripbytecount = (uint64*)_TIFFrealloc(td->td_stripbytecount_p, (td->td_nstrips + delta) * sizeof (uint64)); if (new_stripoffset == NULL || new_stripbytecount == NULL) { if (new_stripoffset) @@ -697,11 +720,11 @@ TIFFGrowStrips(TIFF* tif, uint32 delta, const char* module) TIFFErrorExt(tif->tif_clientdata, module, "No space to expand strip arrays"); return (0); } - td->td_stripoffset = new_stripoffset; - td->td_stripbytecount = new_stripbytecount; - _TIFFmemset(td->td_stripoffset + td->td_nstrips, + td->td_stripoffset_p = new_stripoffset; + td->td_stripbytecount_p = new_stripbytecount; + _TIFFmemset(td->td_stripoffset_p + td->td_nstrips, 0, delta*sizeof (uint64)); - _TIFFmemset(td->td_stripbytecount + td->td_nstrips, + _TIFFmemset(td->td_stripbytecount_p + td->td_nstrips, 0, delta*sizeof (uint64)); td->td_nstrips += delta; tif->tif_flags |= TIFF_DIRTYDIRECT; @@ -720,12 +743,12 @@ TIFFAppendToStrip(TIFF* tif, uint32 strip, uint8* data, tmsize_t cc) uint64 m; int64 old_byte_count = -1; - if (td->td_stripoffset[strip] == 0 || tif->tif_curoff == 0) { + if (td->td_stripoffset_p[strip] == 0 || tif->tif_curoff == 0) { assert(td->td_nstrips > 0); - if( td->td_stripbytecount[strip] != 0 - && td->td_stripoffset[strip] != 0 - && td->td_stripbytecount[strip] >= (uint64) cc ) + if( td->td_stripbytecount_p[strip] != 0 + && td->td_stripoffset_p[strip] != 0 + && td->td_stripbytecount_p[strip] >= (uint64) cc ) { /* * There is already tile data on disk, and the new tile @@ -734,7 +757,7 @@ TIFFAppendToStrip(TIFF* tif, uint32 strip, uint8* data, tmsize_t cc) * more data to append to this strip before we are done * depending on how we are getting called. */ - if (!SeekOK(tif, td->td_stripoffset[strip])) { + if (!SeekOK(tif, td->td_stripoffset_p[strip])) { TIFFErrorExt(tif->tif_clientdata, module, "Seek error at scanline %lu", (unsigned long)tif->tif_row); @@ -747,17 +770,17 @@ TIFFAppendToStrip(TIFF* tif, uint32 strip, uint8* data, tmsize_t cc) * Seek to end of file, and set that as our location to * write this strip. */ - td->td_stripoffset[strip] = TIFFSeekFile(tif, 0, SEEK_END); + td->td_stripoffset_p[strip] = TIFFSeekFile(tif, 0, SEEK_END); tif->tif_flags |= TIFF_DIRTYSTRIP; } - tif->tif_curoff = td->td_stripoffset[strip]; + tif->tif_curoff = td->td_stripoffset_p[strip]; /* * We are starting a fresh strip/tile, so set the size to zero. 
*/ - old_byte_count = td->td_stripbytecount[strip]; - td->td_stripbytecount[strip] = 0; + old_byte_count = td->td_stripbytecount_p[strip]; + td->td_stripbytecount_p[strip] = 0; } m = tif->tif_curoff+cc; @@ -774,9 +797,9 @@ TIFFAppendToStrip(TIFF* tif, uint32 strip, uint8* data, tmsize_t cc) return (0); } tif->tif_curoff = m; - td->td_stripbytecount[strip] += cc; + td->td_stripbytecount_p[strip] += cc; - if( (int64) td->td_stripbytecount[strip] != old_byte_count ) + if( (int64) td->td_stripbytecount_p[strip] != old_byte_count ) tif->tif_flags |= TIFF_DIRTYSTRIP; return (1); diff --git a/3rdparty/libtiff/tif_zip.c b/3rdparty/libtiff/tif_zip.c index c75077349e..e71c312c80 100644 --- a/3rdparty/libtiff/tif_zip.c +++ b/3rdparty/libtiff/tif_zip.c @@ -29,24 +29,22 @@ * * ZIP (aka Deflate) Compression Support * - * This file is simply an interface to the zlib library written by + * This file is an interface to the zlib library written by * Jean-loup Gailly and Mark Adler. You must use version 1.0 or later - * of the library: this code assumes the 1.0 API and also depends on - * the ability to write the zlib header multiple times (one per strip) - * which was not possible with versions prior to 0.95. Note also that - * older versions of this codec avoided this bug by suppressing the header - * entirely. This means that files written with the old library cannot - * be read; they should be converted to a different compression scheme - * and then reconverted. + * of the library. * - * The data format used by the zlib library is described in the files - * zlib-3.1.doc, deflate-1.1.doc and gzip-4.1.doc, available in the - * directory ftp://ftp.uu.net/pub/archiving/zip/doc. The library was - * last found at ftp://ftp.uu.net/pub/archiving/zip/zlib/zlib-0.99.tar.gz. + * Optionally, libdeflate (https://github.com/ebiggers/libdeflate) may be used + * to do the compression and decompression, but only for whole strips and tiles. + * For scanline access, zlib will be used as a fallback.
*/ #include "tif_predict.h" #include "zlib.h" +#if LIBDEFLATE_SUPPORT +#include "libdeflate.h" +#endif +#define LIBDEFLATE_MAX_COMPRESSION_LEVEL 12 + #include /* @@ -70,6 +68,12 @@ typedef struct { z_stream stream; int zipquality; /* compression level */ int state; /* state flags */ + int subcodec; /* DEFLATE_SUBCODEC_ZLIB or DEFLATE_SUBCODEC_LIBDEFLATE */ +#if LIBDEFLATE_SUPPORT + int libdeflate_state; /* -1 = until first time ZIPEncode() / ZIPDecode() is called, 0 = use zlib, 1 = use libdeflate */ + struct libdeflate_decompressor* libdeflate_dec; + struct libdeflate_compressor* libdeflate_enc; +#endif #define ZSTATE_INIT_DECODE 0x01 #define ZSTATE_INIT_ENCODE 0x02 @@ -132,6 +136,9 @@ ZIPPreDecode(TIFF* tif, uint16 s) if( (sp->state & ZSTATE_INIT_DECODE) == 0 ) tif->tif_setupdecode( tif ); +#if LIBDEFLATE_SUPPORT + sp->libdeflate_state = -1; +#endif sp->stream.next_in = tif->tif_rawdata; assert(sizeof(sp->stream.avail_in)==4); /* if this assert gets raised, we need to simplify this code to reflect a ZLib that is likely updated @@ -151,6 +158,77 @@ ZIPDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s) assert(sp != NULL); assert(sp->state == ZSTATE_INIT_DECODE); +#if LIBDEFLATE_SUPPORT + if( sp->libdeflate_state == 1 ) + return 0; + + /* If we have libdeflate support and we are asked to read a whole */ + /* strip/tile, then go for using it */ + do { + TIFFDirectory *td = &tif->tif_dir; + + if( sp->libdeflate_state == 0 ) + break; + if( sp->subcodec == DEFLATE_SUBCODEC_ZLIB ) + break; + + /* Check if we are in the situation where we can use libdeflate */ + if (isTiled(tif)) { + if( TIFFTileSize64(tif) != (uint64)occ ) + break; + } else { + uint32 strip_height = td->td_imagelength - tif->tif_row; + if (strip_height > td->td_rowsperstrip) + strip_height = td->td_rowsperstrip; + if( TIFFVStripSize64(tif, strip_height) != (uint64)occ ) + break; + } + + /* Check for overflow */ + if( (size_t)tif->tif_rawcc != (uint64)tif->tif_rawcc ) + break; + if( (size_t)occ != (uint64)occ ) + break; + + /* Go for decompression using libdeflate */ + { + enum libdeflate_result res; + if( sp->libdeflate_dec == NULL ) + { + sp->libdeflate_dec = libdeflate_alloc_decompressor(); + if( sp->libdeflate_dec == NULL ) + { + break; + } + } + + sp->libdeflate_state = 1; + + res = libdeflate_zlib_decompress( + sp->libdeflate_dec, tif->tif_rawcp, (size_t)tif->tif_rawcc, op, (size_t)occ, NULL); + + tif->tif_rawcp += tif->tif_rawcc; + tif->tif_rawcc = 0; + + /* We accept LIBDEFLATE_INSUFFICIENT_SPACE as a return */ + /* There are odd files in the wild where the last strip, when */ + /* it is smaller in height than td_rowsperstrip, actually contains */ + /* data for td_rowsperstrip lines. Just ignore that silently.
*/ + if( res != LIBDEFLATE_SUCCESS && + res != LIBDEFLATE_INSUFFICIENT_SPACE ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Decoding error at scanline %lu", + (unsigned long) tif->tif_row); + return 0; + } + + return 1; + } + } while(0); + sp->libdeflate_state = 0; +#endif /* LIBDEFLATE_SUPPORT */ + sp->stream.next_in = tif->tif_rawcp; sp->stream.next_out = op; @@ -198,6 +276,7 @@ ZIPSetupEncode(TIFF* tif) { static const char module[] = "ZIPSetupEncode"; ZIPState* sp = EncoderState(tif); + int cappedQuality; assert(sp != NULL); if (sp->state & ZSTATE_INIT_DECODE) { @@ -205,7 +284,11 @@ ZIPSetupEncode(TIFF* tif) sp->state = 0; } - if (deflateInit(&sp->stream, sp->zipquality) != Z_OK) { + cappedQuality = sp->zipquality; + if( cappedQuality > Z_BEST_COMPRESSION ) + cappedQuality = Z_BEST_COMPRESSION; + + if (deflateInit(&sp->stream, cappedQuality) != Z_OK) { TIFFErrorExt(tif->tif_clientdata, module, "%s", SAFE_MSG(sp)); return (0); } else { @@ -227,6 +310,9 @@ ZIPPreEncode(TIFF* tif, uint16 s) if( sp->state != ZSTATE_INIT_ENCODE ) tif->tif_setupencode( tif ); +#if LIBDEFLATE_SUPPORT + sp->libdeflate_state = -1; +#endif sp->stream.next_out = tif->tif_rawdata; assert(sizeof(sp->stream.avail_out)==4); /* if this assert gets raised, we need to simplify this code to reflect a ZLib that is likely updated @@ -249,6 +335,95 @@ ZIPEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) assert(sp->state == ZSTATE_INIT_ENCODE); (void) s; + +#if LIBDEFLATE_SUPPORT + if( sp->libdeflate_state == 1 ) + return 0; + + /* If we have libdeflate support and we are asked to write a whole */ + /* strip/tile, then go for using it */ + do { + TIFFDirectory *td = &tif->tif_dir; + + if( sp->libdeflate_state == 0 ) + break; + if( sp->subcodec == DEFLATE_SUBCODEC_ZLIB ) + break; + + /* Libdeflate does not support the 0-compression level */ + if( sp->zipquality == Z_NO_COMPRESSION ) + break; + + /* Check if we are in the situation where we can use libdeflate */ + if (isTiled(tif)) { + if( TIFFTileSize64(tif) != (uint64)cc ) + break; + } else { + uint32 strip_height = td->td_imagelength - tif->tif_row; + if (strip_height > td->td_rowsperstrip) + strip_height = td->td_rowsperstrip; + if( TIFFVStripSize64(tif, strip_height) != (uint64)cc ) + break; + } + + /* Check for overflow */ + if( (size_t)tif->tif_rawdatasize != (uint64)tif->tif_rawdatasize ) + break; + if( (size_t)cc != (uint64)cc ) + break; + + /* Go for compression using libdeflate */ + { + size_t nCompressedBytes; + if( sp->libdeflate_enc == NULL ) + { + /* To get results as good as zlib, we asked for an extra */ + /* level of compression */ + sp->libdeflate_enc = libdeflate_alloc_compressor( + sp->zipquality == Z_DEFAULT_COMPRESSION ? 7 : + sp->zipquality >= 6 && sp->zipquality <= 9 ? sp->zipquality + 1 : + sp->zipquality); + if( sp->libdeflate_enc == NULL ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Cannot allocate compressor"); + break; + } + } + + /* Make sure the output buffer is large enough for the worst case. */ + /* In TIFFWriteBufferSetup(), when libtiff allocates the buffer */ + /* we've taken a 10% margin over the uncompressed size, which should */ + /* be large enough even for the worst-case scenario.
*/ + if( libdeflate_zlib_compress_bound(sp->libdeflate_enc, (size_t)cc) > + (size_t)tif->tif_rawdatasize) + { + break; + } + + sp->libdeflate_state = 1; + nCompressedBytes = libdeflate_zlib_compress( + sp->libdeflate_enc, bp, (size_t)cc, tif->tif_rawdata, (size_t)tif->tif_rawdatasize); + + if( nCompressedBytes == 0 ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Encoder error at scanline %lu", + (unsigned long) tif->tif_row); + return 0; + } + + tif->tif_rawcc = nCompressedBytes; + + if( !TIFFFlushData1(tif) ) + return 0; + + return 1; + } + } while(0); + sp->libdeflate_state = 0; +#endif /* LIBDEFLATE_SUPPORT */ + sp->stream.next_in = bp; assert(sizeof(sp->stream.avail_in)==4); /* if this assert gets raised, we need to simplify this code to reflect a ZLib that is likely updated @@ -265,7 +440,8 @@ ZIPEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) } if (sp->stream.avail_out == 0) { tif->tif_rawcc = tif->tif_rawdatasize; - TIFFFlushData1(tif); + if (!TIFFFlushData1(tif)) + return 0; sp->stream.next_out = tif->tif_rawdata; sp->stream.avail_out = (uint64)tif->tif_rawdatasize <= 0xFFFFFFFFU ? (uInt)tif->tif_rawdatasize : 0xFFFFFFFFU; } @@ -285,6 +461,11 @@ ZIPPostEncode(TIFF* tif) ZIPState *sp = EncoderState(tif); int state; +#if LIBDEFLATE_SUPPORT + if( sp->libdeflate_state == 1 ) + return 1; +#endif + sp->stream.avail_in = 0; do { state = deflate(&sp->stream, Z_FINISH); @@ -294,7 +475,8 @@ ZIPPostEncode(TIFF* tif) if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize) { tif->tif_rawcc = tif->tif_rawdatasize - sp->stream.avail_out; - TIFFFlushData1(tif); + if (!TIFFFlushData1(tif)) + return 0; sp->stream.next_out = tif->tif_rawdata; sp->stream.avail_out = (uint64)tif->tif_rawdatasize <= 0xFFFFFFFFU ? (uInt)tif->tif_rawdatasize : 0xFFFFFFFFU; } @@ -327,6 +509,14 @@ ZIPCleanup(TIFF* tif) inflateEnd(&sp->stream); sp->state = 0; } + +#if LIBDEFLATE_SUPPORT + if( sp->libdeflate_dec ) + libdeflate_free_decompressor(sp->libdeflate_dec); + if( sp->libdeflate_enc ) + libdeflate_free_compressor(sp->libdeflate_enc); +#endif + _TIFFfree(sp); tif->tif_data = NULL; @@ -342,15 +532,55 @@ ZIPVSetField(TIFF* tif, uint32 tag, va_list ap) switch (tag) { case TIFFTAG_ZIPQUALITY: sp->zipquality = (int) va_arg(ap, int); - if ( sp->state&ZSTATE_INIT_ENCODE ) { + if( sp->zipquality < Z_DEFAULT_COMPRESSION || + sp->zipquality > LIBDEFLATE_MAX_COMPRESSION_LEVEL ) { + TIFFErrorExt(tif->tif_clientdata, module, + "Invalid ZipQuality value. 
Should be in [-1,%d] range", + LIBDEFLATE_MAX_COMPRESSION_LEVEL); + return 0; + } + + if ( sp->state&ZSTATE_INIT_ENCODE ) { + int cappedQuality = sp->zipquality; + if( cappedQuality > Z_BEST_COMPRESSION ) + cappedQuality = Z_BEST_COMPRESSION; if (deflateParams(&sp->stream, - sp->zipquality, Z_DEFAULT_STRATEGY) != Z_OK) { + cappedQuality, Z_DEFAULT_STRATEGY) != Z_OK) { TIFFErrorExt(tif->tif_clientdata, module, "ZLib error: %s", SAFE_MSG(sp)); return (0); } } + +#if LIBDEFLATE_SUPPORT + if( sp->libdeflate_enc ) + { + libdeflate_free_compressor(sp->libdeflate_enc); + sp->libdeflate_enc = NULL; + } +#endif + return (1); + + case TIFFTAG_DEFLATE_SUBCODEC: + sp->subcodec = (int) va_arg(ap, int); + if( sp->subcodec != DEFLATE_SUBCODEC_ZLIB && + sp->subcodec != DEFLATE_SUBCODEC_LIBDEFLATE ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "Invalid DeflateCodec value."); + return 0; + } +#if !LIBDEFLATE_SUPPORT + if( sp->subcodec == DEFLATE_SUBCODEC_LIBDEFLATE ) + { + TIFFErrorExt(tif->tif_clientdata, module, + "DeflateCodec = DEFLATE_SUBCODEC_LIBDEFLATE unsupported in this build"); + return 0; + } +#endif + return 1; + default: return (*sp->vsetparent)(tif, tag, ap); } @@ -366,6 +596,11 @@ ZIPVGetField(TIFF* tif, uint32 tag, va_list ap) case TIFFTAG_ZIPQUALITY: *va_arg(ap, int*) = sp->zipquality; break; + + case TIFFTAG_DEFLATE_SUBCODEC: + *va_arg(ap, int*) = sp->subcodec; + break; + default: return (*sp->vgetparent)(tif, tag, ap); } @@ -374,6 +609,7 @@ ZIPVGetField(TIFF* tif, uint32 tag, va_list ap) static const TIFFField zipFields[] = { { TIFFTAG_ZIPQUALITY, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "", NULL }, + { TIFFTAG_DEFLATE_SUBCODEC, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "", NULL }, }; int @@ -384,6 +620,9 @@ TIFFInitZIP(TIFF* tif, int scheme) assert( (scheme == COMPRESSION_DEFLATE) || (scheme == COMPRESSION_ADOBE_DEFLATE)); +#ifdef NDEBUG + (void)scheme; +#endif /* * Merge codec-specific tag information. @@ -397,7 +636,7 @@ TIFFInitZIP(TIFF* tif, int scheme) /* * Allocate state block so tag methods have storage to record values. */ - tif->tif_data = (uint8*) _TIFFmalloc(sizeof (ZIPState)); + tif->tif_data = (uint8*) _TIFFcalloc(sizeof (ZIPState), 1); if (tif->tif_data == NULL) goto bad; sp = ZState(tif); @@ -417,6 +656,11 @@ TIFFInitZIP(TIFF* tif, int scheme) /* Default values for codec-specific fields */ sp->zipquality = Z_DEFAULT_COMPRESSION; /* default comp. level */ sp->state = 0; +#if LIBDEFLATE_SUPPORT + sp->subcodec = DEFLATE_SUBCODEC_LIBDEFLATE; +#else + sp->subcodec = DEFLATE_SUBCODEC_ZLIB; +#endif /* * Install codec methods. 
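/* Usage aside (a hedged sketch, not part of the patch): how application code
   might drive the knobs added above, assuming a libtiff >= 4.2 built with
   libdeflate support. With libdeflate, TIFFTAG_ZIPQUALITY accepts -1
   (Z_DEFAULT_COMPRESSION) through LIBDEFLATE_MAX_COMPRESSION_LEVEL (12); the
   zlib path caps the level at Z_BEST_COMPRESSION (9), as ZIPSetupEncode()
   above shows. */
#include <tiffio.h>

static int configure_deflate(TIFF* tif, int force_zlib)
{
    if (!TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_ADOBE_DEFLATE))
        return 0;
    /* Levels 10-12 only take effect on the libdeflate subcodec. */
    if (!TIFFSetField(tif, TIFFTAG_ZIPQUALITY, force_zlib ? 9 : 12))
        return 0;
#ifdef TIFFTAG_DEFLATE_SUBCODEC
    /* Request bit-exact zlib behaviour when the caller needs it. */
    if (force_zlib &&
        !TIFFSetField(tif, TIFFTAG_DEFLATE_SUBCODEC, DEFLATE_SUBCODEC_ZLIB))
        return 0;
#endif
    return 1;
}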
diff --git a/3rdparty/libtiff/tif_zstd.c b/3rdparty/libtiff/tif_zstd.c index 21c935e2da..66135e03c1 100644 --- a/3rdparty/libtiff/tif_zstd.c +++ b/3rdparty/libtiff/tif_zstd.c @@ -260,7 +260,8 @@ ZSTDEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s) } if( sp->out_buffer.pos == sp->out_buffer.size ) { tif->tif_rawcc = tif->tif_rawdatasize; - TIFFFlushData1(tif); + if (!TIFFFlushData1(tif)) + return 0; sp->out_buffer.dst = tif->tif_rawcp; sp->out_buffer.pos = 0; } @@ -289,7 +290,8 @@ ZSTDPostEncode(TIFF* tif) } if( sp->out_buffer.pos > 0 ) { tif->tif_rawcc = sp->out_buffer.pos; - TIFFFlushData1(tif); + if (!TIFFFlushData1(tif)) + return 0; sp->out_buffer.dst = tif->tif_rawcp; sp->out_buffer.pos = 0; } diff --git a/3rdparty/libtiff/tiff.h b/3rdparty/libtiff/tiff.h index 5b0a0c90f6..2d4a47679d 100644 --- a/3rdparty/libtiff/tiff.h +++ b/3rdparty/libtiff/tiff.h @@ -119,6 +119,11 @@ typedef struct { * Tag data type information. * * Note: RATIONALs are the ratio of two 32-bit integer values. + *--: + * Note2: TIFF_IFD8 data type is used in tiffFields[]-tag definition in order to distinguish the write-handling + of those tags between ClassicTIFF and BigTiff: + For ClassicTIFF libtiff writes a 32-bit value and the TIFF_IFD type-id into the file + For BigTIFF libtiff writes a 64-bit value and the TIFF_IFD8 type-id into the file */ typedef enum { TIFF_NOTYPE = 0, /* placeholder */ @@ -375,6 +380,7 @@ typedef enum { January 2004 */ #define TIFFTAG_OPIIMAGEID 32781 /* %OPI ImageID [Adobe TIFF technote] */ +#define TIFFTAG_TIFFANNOTATIONDATA 32932 /* http://web.archive.org/web/20050309141348/http://www.kofile.com/support%20pro/faqs/annospec.htm */ /* tags 32952-32956 are private tags registered to Island Graphics */ #define TIFFTAG_REFPTS 32953 /* image reference points */ #define TIFFTAG_REGIONTACKPOINT 32954 /* region-xform tack point */ @@ -409,8 +415,23 @@ typedef enum { #define TIFFTAG_CFAPATTERN 33422 /* color filter array pattern */ /* tag 33432 is listed in the 6.0 spec w/ unknown ownership */ #define TIFFTAG_COPYRIGHT 33432 /* copyright string */ +/* Tags 33445-33452 are used for GEL fileformat, see + * http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf + */ +#define TIFFTAG_MD_FILETAG 33445 /* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */ +#define TIFFTAG_MD_SCALEPIXEL 33446 /* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */ +#define TIFFTAG_MD_COLORTABLE 33447 /* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */ +#define TIFFTAG_MD_LABNAME 33448 /* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */ +#define TIFFTAG_MD_SAMPLEINFO 33449 /* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */ +#define TIFFTAG_MD_PREPDATE 33450 /* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */ +#define TIFFTAG_MD_PREPTIME 33451 /* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */ +#define TIFFTAG_MD_FILEUNITS 33452 /* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */ /* IPTC TAG from RichTIFF specifications */ #define TIFFTAG_RICHTIFFIPTC 33723 +#define TIFFTAG_INGR_PACKET_DATA_TAG 33918 /* Intergraph Application specific storage. 
*/ +#define TIFFTAG_INGR_FLAG_REGISTERS 33919 /* Intergraph Application specific flags. */ +#define TIFFTAG_IRASB_TRANSORMATION_MATRIX 33920 /* Originally part of Intergraph's GeoTIFF tags, but likely understood by IrasB only. */ +#define TIFFTAG_MODELTIEPOINTTAG 33922 /* GeoTIFF */ /* 34016-34029 are reserved for ANSI IT8 TIFF/IT > 1) + +/* + * Largest 32-bit unsigned integer value. + */ +#define TIFF_UINT32_MAX 0xFFFFFFFFU + +/* + * Largest 64-bit unsigned integer value. + */ +#define TIFF_UINT64_MAX (((uint64)(TIFF_UINT32_MAX)) << 32 | TIFF_UINT32_MAX) + typedef struct client_info { struct client_info *next; void *data; @@ -127,6 +140,9 @@ struct tiff { #define TIFF_DIRTYSTRIP 0x200000U /* stripoffsets/stripbytecount dirty*/ #define TIFF_PERSAMPLE 0x400000U /* get/set per sample tags as arrays */ #define TIFF_BUFFERMMAP 0x800000U /* read buffer (tif_rawdata) points into mmap() memory */ + #define TIFF_DEFERSTRILELOAD 0x1000000U /* defer strip/tile offset/bytecount array loading. */ + #define TIFF_LAZYSTRILELOAD 0x2000000U /* lazy/ondemand loading of strip/tile offset/bytecount values. Only used if TIFF_DEFERSTRILELOAD is set and in read-only mode */ + #define TIFF_CHOPPEDUPARRAYS 0x4000000U /* set when allocChoppedUpStripArrays() has modified strip array */ uint64 tif_diroff; /* file offset of current directory */ uint64 tif_nextdiroff; /* file offset of following directory */ uint64* tif_dirlist; /* list of offsets to already seen directories to prevent IFD looping */ @@ -258,7 +274,7 @@ struct tiff { #define TIFFhowmany8_64(x) (((x)&0x07)?((uint64)(x)>>3)+1:(uint64)(x)>>3) #define TIFFroundup_64(x, y) (TIFFhowmany_64(x,y)*(y)) -/* Safe multiply which returns zero if there is an integer overflow */ +/* Safe multiply which returns zero if there is an *unsigned* integer overflow. This macro is not safe for *signed* integer types */ #define TIFFSafeMultiply(t,v,m) ((((t)(m) != (t)0) && (((t)(((v)*(m))/(m))) == (t)(v))) ? (t)((v)*(m)) : (t)0) #define TIFFmax(A,B) ((A)>(B)?(A):(B)) @@ -351,6 +367,9 @@ extern uint32 _TIFFDefaultStripSize(TIFF* tif, uint32 s); extern void _TIFFDefaultTileSize(TIFF* tif, uint32* tw, uint32* th); extern int _TIFFDataSize(TIFFDataType type); +/*--: Rational2Double: Return size of TIFFSetGetFieldType in bytes. */ +extern int _TIFFSetGetFieldSize(TIFFSetGetFieldType setgettype); + extern void _TIFFsetByteArray(void**, void*, uint32); extern void _TIFFsetString(char**, char*); extern void _TIFFsetShortArray(uint16**, uint16*, uint32); @@ -368,6 +387,8 @@ extern TIFFErrorHandlerExt _TIFFerrorHandlerExt; extern uint32 _TIFFMultiply32(TIFF*, uint32, uint32, const char*); extern uint64 _TIFFMultiply64(TIFF*, uint64, uint64, const char*); +extern tmsize_t _TIFFMultiplySSize(TIFF*, tmsize_t, tmsize_t, const char*); +extern tmsize_t _TIFFCastUInt64ToSSize(TIFF*, uint64, const char*); extern void* _TIFFCheckMalloc(TIFF*, tmsize_t, tmsize_t, const char*); extern void* _TIFFCheckRealloc(TIFF*, void*, tmsize_t, tmsize_t, const char*); diff --git a/3rdparty/libtiff/tiffvers.h b/3rdparty/libtiff/tiffvers.h index 403d61be04..0cce798b83 100644 --- a/3rdparty/libtiff/tiffvers.h +++ b/3rdparty/libtiff/tiffvers.h @@ -1,4 +1,4 @@ -#define TIFFLIB_VERSION_STR "LIBTIFF, Version 4.0.10\nCopyright (c) 1988-1996 Sam Leffler\nCopyright (c) 1991-1996 Silicon Graphics, Inc." +#define TIFFLIB_VERSION_STR "LIBTIFF, Version 4.2.0\nCopyright (c) 1988-1996 Sam Leffler\nCopyright (c) 1991-1996 Silicon Graphics, Inc." 
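/* Aside (a hedged sketch, not part of the patch): downstream code typically
   gates compile-time feature use on the date-based TIFFLIB_VERSION bumped
   above; runtime checks should instead parse the string from TIFFGetVersion(),
   as the comment below notes. */
#include <tiffvers.h>

#if TIFFLIB_VERSION >= 20201219 /* libtiff 4.2.0 or newer */
/* Safe to reference additions from this update, e.g. TIFFTAG_DEFLATE_SUBCODEC. */
#endif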
/* * This define can be used in code that requires * compilation-related definitions specific to a @@ -6,4 +6,4 @@ * version checking should be done based on the * string returned by TIFFGetVersion. */ -#define TIFFLIB_VERSION 20181110 +#define TIFFLIB_VERSION 20201219 diff --git a/3rdparty/libwebp/src/dec/io_dec.c b/3rdparty/libwebp/src/dec/io_dec.c index e603f19c98..29dc6345df 100644 --- a/3rdparty/libwebp/src/dec/io_dec.c +++ b/3rdparty/libwebp/src/dec/io_dec.c @@ -25,21 +25,16 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) { WebPDecBuffer* output = p->output; const WebPYUVABuffer* const buf = &output->u.YUVA; - uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride; - uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride; - uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride; + uint8_t* const y_dst = buf->y + (size_t)io->mb_y * buf->y_stride; + uint8_t* const u_dst = buf->u + (size_t)(io->mb_y >> 1) * buf->u_stride; + uint8_t* const v_dst = buf->v + (size_t)(io->mb_y >> 1) * buf->v_stride; const int mb_w = io->mb_w; const int mb_h = io->mb_h; const int uv_w = (mb_w + 1) / 2; const int uv_h = (mb_h + 1) / 2; - int j; - for (j = 0; j < mb_h; ++j) { - memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w); - } - for (j = 0; j < uv_h; ++j) { - memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w); - memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w); - } + WebPCopyPlane(io->y, io->y_stride, y_dst, buf->y_stride, mb_w, mb_h); + WebPCopyPlane(io->u, io->uv_stride, u_dst, buf->u_stride, uv_w, uv_h); + WebPCopyPlane(io->v, io->uv_stride, v_dst, buf->v_stride, uv_w, uv_h); return io->mb_h; } @@ -47,7 +42,7 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) { static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) { WebPDecBuffer* const output = p->output; WebPRGBABuffer* const buf = &output->u.RGBA; - uint8_t* const dst = buf->rgba + io->mb_y * buf->stride; + uint8_t* const dst = buf->rgba + (size_t)io->mb_y * buf->stride; WebPSamplerProcessPlane(io->y, io->y_stride, io->u, io->v, io->uv_stride, dst, buf->stride, io->mb_w, io->mb_h, @@ -62,7 +57,7 @@ static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) { static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) { int num_lines_out = io->mb_h; // a priori guess const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* dst = buf->rgba + io->mb_y * buf->stride; + uint8_t* dst = buf->rgba + (size_t)io->mb_y * buf->stride; WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace]; const uint8_t* cur_y = io->y; const uint8_t* cur_u = io->u; @@ -133,7 +128,7 @@ static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p, const WebPYUVABuffer* const buf = &p->output->u.YUVA; const int mb_w = io->mb_w; const int mb_h = io->mb_h; - uint8_t* dst = buf->a + io->mb_y * buf->a_stride; + uint8_t* dst = buf->a + (size_t)io->mb_y * buf->a_stride; int j; (void)expected_num_lines_out; assert(expected_num_lines_out == mb_h); @@ -186,7 +181,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p, (colorspace == MODE_ARGB || colorspace == MODE_Argb); const WebPRGBABuffer* const buf = &p->output->u.RGBA; int num_rows; - const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows); + const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows); uint8_t* const base_rgba = buf->rgba + start_y * buf->stride; uint8_t* const dst = base_rgba + (alpha_first ? 
0 : 3); const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w, @@ -210,7 +205,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p, const WEBP_CSP_MODE colorspace = p->output->colorspace; const WebPRGBABuffer* const buf = &p->output->u.RGBA; int num_rows; - const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows); + const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows); uint8_t* const base_rgba = buf->rgba + start_y * buf->stride; #if (WEBP_SWAP_16BIT_CSP == 1) uint8_t* alpha_dst = base_rgba; @@ -276,9 +271,9 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) { static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p, int expected_num_lines_out) { const WebPYUVABuffer* const buf = &p->output->u.YUVA; - uint8_t* const dst_a = buf->a + p->last_y * buf->a_stride; + uint8_t* const dst_a = buf->a + (size_t)p->last_y * buf->a_stride; if (io->a != NULL) { - uint8_t* const dst_y = buf->y + p->last_y * buf->y_stride; + uint8_t* const dst_y = buf->y + (size_t)p->last_y * buf->y_stride; const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a); assert(expected_num_lines_out == num_lines_out); if (num_lines_out > 0) { // unmultiply the Y @@ -356,7 +351,7 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) { const WebPYUV444Converter convert = WebPYUV444Converters[p->output->colorspace]; const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* dst = buf->rgba + y_pos * buf->stride; + uint8_t* dst = buf->rgba + (size_t)y_pos * buf->stride; int num_lines_out = 0; // For RGB rescaling, because of the YUV420, current scan position // U/V can be +1/-1 line from the Y one. Hence the double test. @@ -383,15 +378,15 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) { while (j < mb_h) { const int y_lines_in = WebPRescalerImport(p->scaler_y, mb_h - j, - io->y + j * io->y_stride, io->y_stride); + io->y + (size_t)j * io->y_stride, io->y_stride); j += y_lines_in; if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) { - const int u_lines_in = - WebPRescalerImport(p->scaler_u, uv_mb_h - uv_j, - io->u + uv_j * io->uv_stride, io->uv_stride); - const int v_lines_in = - WebPRescalerImport(p->scaler_v, uv_mb_h - uv_j, - io->v + uv_j * io->uv_stride, io->uv_stride); + const int u_lines_in = WebPRescalerImport( + p->scaler_u, uv_mb_h - uv_j, io->u + (size_t)uv_j * io->uv_stride, + io->uv_stride); + const int v_lines_in = WebPRescalerImport( + p->scaler_v, uv_mb_h - uv_j, io->v + (size_t)uv_j * io->uv_stride, + io->uv_stride); (void)v_lines_in; // remove a gcc warning assert(u_lines_in == v_lines_in); uv_j += u_lines_in; @@ -403,7 +398,7 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) { static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) { const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride; + uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride; const WEBP_CSP_MODE colorspace = p->output->colorspace; const int alpha_first = (colorspace == MODE_ARGB || colorspace == MODE_Argb); @@ -431,7 +426,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) { static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos, int max_lines_out) { const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride; + uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * 
buf->stride; #if (WEBP_SWAP_16BIT_CSP == 1) uint8_t* alpha_dst = base_rgba; #else @@ -470,7 +465,7 @@ static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p, int lines_left = expected_num_out_lines; const int y_end = p->last_y + lines_left; while (lines_left > 0) { - const int row_offset = scaler->src_y - io->mb_y; + const int64_t row_offset = (int64_t)scaler->src_y - io->mb_y; WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y, io->a + row_offset * io->width, io->width); lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left); diff --git a/3rdparty/libwebp/src/dec/vp8_dec.c b/3rdparty/libwebp/src/dec/vp8_dec.c index 57efb69041..8f73697478 100644 --- a/3rdparty/libwebp/src/dec/vp8_dec.c +++ b/3rdparty/libwebp/src/dec/vp8_dec.c @@ -494,13 +494,11 @@ static int GetCoeffsAlt(VP8BitReader* const br, return 16; } -static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) { - if (GetCoeffs == NULL) { - if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) { - GetCoeffs = GetCoeffsAlt; - } else { - GetCoeffs = GetCoeffsFast; - } +WEBP_DSP_INIT_FUNC(InitGetCoeffs) { + if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) { + GetCoeffs = GetCoeffsAlt; + } else { + GetCoeffs = GetCoeffsFast; } } diff --git a/3rdparty/libwebp/src/dec/vp8i_dec.h b/3rdparty/libwebp/src/dec/vp8i_dec.h index 600a684410..a0c0af1579 100644 --- a/3rdparty/libwebp/src/dec/vp8i_dec.h +++ b/3rdparty/libwebp/src/dec/vp8i_dec.h @@ -31,7 +31,7 @@ extern "C" { // version numbers #define DEC_MAJ_VERSION 1 -#define DEC_MIN_VERSION 1 +#define DEC_MIN_VERSION 2 #define DEC_REV_VERSION 0 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline). diff --git a/3rdparty/libwebp/src/dec/vp8l_dec.c b/3rdparty/libwebp/src/dec/vp8l_dec.c index 93615d4ed2..2d603b4379 100644 --- a/3rdparty/libwebp/src/dec/vp8l_dec.c +++ b/3rdparty/libwebp/src/dec/vp8l_dec.c @@ -947,7 +947,6 @@ static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) { break; default: goto Copy; - break; } CopySmallPattern8b(src, dst, length, pattern); return; diff --git a/3rdparty/libwebp/src/demux/anim_decode.c b/3rdparty/libwebp/src/demux/anim_decode.c index 05dd707371..3dcacc35d6 100644 --- a/3rdparty/libwebp/src/demux/anim_decode.c +++ b/3rdparty/libwebp/src/demux/anim_decode.c @@ -346,12 +346,15 @@ int WebPAnimDecoderGetNext(WebPAnimDecoder* dec, { const uint8_t* in = iter.fragment.bytes; const size_t in_size = iter.fragment.size; - const size_t out_offset = - (iter.y_offset * width + iter.x_offset) * NUM_CHANNELS; + const uint32_t stride = width * NUM_CHANNELS; // at most 25 + 2 bits + const uint64_t out_offset = (uint64_t)iter.y_offset * stride + + (uint64_t)iter.x_offset * NUM_CHANNELS; // 53b + const uint64_t size = (uint64_t)iter.height * stride; // at most 25 + 27b WebPDecoderConfig* const config = &dec->config_; WebPRGBABuffer* const buf = &config->output.u.RGBA; - buf->stride = NUM_CHANNELS * width; - buf->size = buf->stride * iter.height; + if ((size_t)size != size) goto Error; + buf->stride = (int)stride; + buf->size = (size_t)size; buf->rgba = dec->curr_frame_ + out_offset; if (WebPDecode(in, in_size, config) != VP8_STATUS_OK) { diff --git a/3rdparty/libwebp/src/demux/demux.c b/3rdparty/libwebp/src/demux/demux.c index 1b3cc2e0a8..860e2ce761 100644 --- a/3rdparty/libwebp/src/demux/demux.c +++ b/3rdparty/libwebp/src/demux/demux.c @@ -24,7 +24,7 @@ #include "src/webp/format_constants.h" #define DMUX_MAJ_VERSION 1 -#define DMUX_MIN_VERSION 1 +#define DMUX_MIN_VERSION 2 #define 
DMUX_REV_VERSION 0 typedef struct { @@ -312,6 +312,7 @@ static ParseStatus ParseAnimationFrame( int bits; MemBuffer* const mem = &dmux->mem_; Frame* frame; + size_t start_offset; ParseStatus status = NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame); if (status != PARSE_OK) return status; @@ -332,7 +333,11 @@ static ParseStatus ParseAnimationFrame( // Store a frame only if the animation flag is set there is some data for // this frame is available. + start_offset = mem->start_; status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame); + if (status != PARSE_ERROR && mem->start_ - start_offset > anmf_payload_size) { + status = PARSE_ERROR; + } if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) { added_frame = AddFrame(dmux, frame); if (added_frame) { diff --git a/3rdparty/libwebp/src/dsp/alpha_processing.c b/3rdparty/libwebp/src/dsp/alpha_processing.c index 819d1391f2..3a27990ddc 100644 --- a/3rdparty/libwebp/src/dsp/alpha_processing.c +++ b/3rdparty/libwebp/src/dsp/alpha_processing.c @@ -359,6 +359,11 @@ static int HasAlpha32b_C(const uint8_t* src, int length) { return 0; } +static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) { + int x; + for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color; +} + //------------------------------------------------------------------------------ // Simple channel manipulations. @@ -400,6 +405,7 @@ void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b, int (*WebPHasAlpha8b)(const uint8_t* src, int length); int (*WebPHasAlpha32b)(const uint8_t* src, int length); +void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color); //------------------------------------------------------------------------------ // Init function @@ -428,6 +434,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) { WebPHasAlpha8b = HasAlpha8b_C; WebPHasAlpha32b = HasAlpha32b_C; + WebPAlphaReplace = AlphaReplace_C; // If defined, use CPUInfo() to overwrite some pointers with faster versions. 
if (VP8GetCPUInfo != NULL) { @@ -469,4 +476,5 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) { assert(WebPPackRGB != NULL); assert(WebPHasAlpha8b != NULL); assert(WebPHasAlpha32b != NULL); + assert(WebPAlphaReplace != NULL); } diff --git a/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c index 2871c56d84..f6c6e0fb1a 100644 --- a/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c +++ b/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c @@ -265,6 +265,27 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) { return 0; } +static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) { + const __m128i m_color = _mm_set1_epi32(color); + const __m128i zero = _mm_setzero_si128(); + int i = 0; + for (; i + 8 <= length; i += 8) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0)); + const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4)); + const __m128i b0 = _mm_srai_epi32(a0, 24); + const __m128i b1 = _mm_srai_epi32(a1, 24); + const __m128i c0 = _mm_cmpeq_epi32(b0, zero); + const __m128i c1 = _mm_cmpeq_epi32(b1, zero); + const __m128i d0 = _mm_and_si128(c0, m_color); + const __m128i d1 = _mm_and_si128(c1, m_color); + const __m128i e0 = _mm_andnot_si128(c0, a0); + const __m128i e1 = _mm_andnot_si128(c1, a1); + _mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0)); + _mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1)); + } + for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color; +} + // ----------------------------------------------------------------------------- // Apply alpha value to rows @@ -334,6 +355,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { WebPHasAlpha8b = HasAlpha8b_SSE2; WebPHasAlpha32b = HasAlpha32b_SSE2; + WebPAlphaReplace = AlphaReplace_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/3rdparty/libwebp/src/dsp/cpu.c b/3rdparty/libwebp/src/dsp/cpu.c index 0fa5b6a5ce..4ca90d88bf 100644 --- a/3rdparty/libwebp/src/dsp/cpu.c +++ b/3rdparty/libwebp/src/dsp/cpu.c @@ -55,12 +55,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) { : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "a"(info_type), "c"(0)); } -#elif (defined(_M_X64) || defined(_M_IX86)) && \ - defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1 +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + +#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1 #include #define GetCPUInfo(info, type) __cpuidex(info, type, 0) // set ecx=0 -#elif defined(WEBP_MSC_SSE2) +#define WEBP_HAVE_MSC_CPUID +#elif _MSC_VER > 1310 +#include #define GetCPUInfo __cpuid +#define WEBP_HAVE_MSC_CPUID +#endif + #endif // NaCl has no support for xgetbv or the raw opcode. @@ -94,7 +100,7 @@ static WEBP_INLINE uint64_t xgetbv(void) { #define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains. #endif -#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2) +#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID) // helper function for run-time detection of slow SSSE3 platforms static int CheckSlowModel(int info) { @@ -179,6 +185,30 @@ static int AndroidCPUInfo(CPUFeature feature) { return 0; } VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo; +#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test +// Use compile flags as an indicator of SIMD support instead of a runtime check. 
+static int wasmCPUInfo(CPUFeature feature) { + switch (feature) { +#ifdef WEBP_USE_SSE2 + case kSSE2: + return 1; +#endif +#ifdef WEBP_USE_SSE41 + case kSSE3: + case kSlowSSSE3: + case kSSE4_1: + return 1; +#endif +#ifdef WEBP_USE_NEON + case kNEON: + return 1; +#endif + default: + break; + } + return 0; +} +VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo; #elif defined(WEBP_USE_NEON) // define a dummy function to enable turning off NEON at runtime by setting // VP8DecGetCPUInfo = NULL diff --git a/3rdparty/libwebp/src/dsp/dec_neon.c b/3rdparty/libwebp/src/dsp/dec_neon.c index 239ec4167e..fa851707e2 100644 --- a/3rdparty/libwebp/src/dsp/dec_neon.c +++ b/3rdparty/libwebp/src/dsp/dec_neon.c @@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) { // DC const uint8x8_t A = vld1_u8(dst - BPS); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); - const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); - const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); - const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); - const uint16x8_t s0 = vaddq_u16(L0, L1); - const uint16x8_t s1 = vaddq_u16(L2, L3); + const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1); + const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1); + const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1); + const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1); + const uint16x8_t s0 = vaddl_u8(L0, L1); + const uint16x8_t s1 = vaddl_u8(L2, L3); const uint16x8_t s01 = vaddq_u16(s0, s1); const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1)); const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3 @@ -1429,8 +1429,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) { if (do_top) { const uint8x8_t A = vld1_u8(dst - BPS); // top row #if defined(__aarch64__) - const uint16x8_t B = vmovl_u8(A); - const uint16_t p2 = vaddvq_u16(B); + const uint16_t p2 = vaddlv_u8(A); sum_top = vdupq_n_u16(p2); #else const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top @@ -1441,18 +1440,18 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) { } if (do_left) { - const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); - const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); - const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); - const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); - const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1)); - const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1)); - const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1)); - const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1)); - const uint16x8_t s0 = vaddq_u16(L0, L1); - const uint16x8_t s1 = vaddq_u16(L2, L3); - const uint16x8_t s2 = vaddq_u16(L4, L5); - const uint16x8_t s3 = vaddq_u16(L6, L7); + const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1); + const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1); + const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1); + const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1); + const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1); + const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1); + const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1); + const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1); + const uint16x8_t s0 = vaddl_u8(L0, L1); + const uint16x8_t s1 = vaddl_u8(L2, L3); + const uint16x8_t s2 = vaddl_u8(L4, L5); + const uint16x8_t s3 = vaddl_u8(L6, L7); const uint16x8_t s01 = vaddq_u16(s0, s1); const uint16x8_t s23 = vaddq_u16(s2, s3); 
sum_left = vaddq_u16(s01, s23); @@ -1512,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) { if (do_top) { const uint8x16_t A = vld1q_u8(dst - BPS); // top row +#if defined(__aarch64__) + const uint16_t p3 = vaddlvq_u8(A); + sum_top = vdupq_n_u16(p3); +#else const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_top = vcombine_u16(p3, p3); +#endif } if (do_left) { int i; sum_left = vdupq_n_u16(0); for (i = 0; i < 16; i += 8) { - const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1)); - const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1)); - const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1)); - const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1)); - const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1)); - const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1)); - const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1)); - const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1)); - const uint16x8_t s0 = vaddq_u16(L0, L1); - const uint16x8_t s1 = vaddq_u16(L2, L3); - const uint16x8_t s2 = vaddq_u16(L4, L5); - const uint16x8_t s3 = vaddq_u16(L6, L7); + const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1); + const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1); + const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1); + const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1); + const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1); + const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1); + const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1); + const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1); + const uint16x8_t s0 = vaddl_u8(L0, L1); + const uint16x8_t s1 = vaddl_u8(L2, L3); + const uint16x8_t s2 = vaddl_u8(L4, L5); + const uint16x8_t s3 = vaddl_u8(L6, L7); const uint16x8_t s01 = vaddq_u16(s0, s1); const uint16x8_t s23 = vaddq_u16(s2, s3); const uint16x8_t sum = vaddq_u16(s01, s23); diff --git a/3rdparty/libwebp/src/dsp/dsp.h b/3rdparty/libwebp/src/dsp/dsp.h index a784de334a..298c721ae2 100644 --- a/3rdparty/libwebp/src/dsp/dsp.h +++ b/3rdparty/libwebp/src/dsp/dsp.h @@ -51,9 +51,7 @@ extern "C" { # define __has_builtin(x) 0 #endif -// for now, none of the optimizations below are available in emscripten -#if !defined(EMSCRIPTEN) - +#if !defined(HAVE_CONFIG_H) #if defined(_MSC_VER) && _MSC_VER > 1310 && \ (defined(_M_X64) || defined(_M_IX86)) #define WEBP_MSC_SSE2 // Visual C++ SSE2 targets @@ -63,6 +61,7 @@ extern "C" { (defined(_M_X64) || defined(_M_IX86)) #define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets #endif +#endif // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp // files without intrinsics, allowing the corresponding Init() to be called. @@ -76,6 +75,9 @@ extern "C" { #define WEBP_USE_SSE41 #endif +#undef WEBP_MSC_SSE41 +#undef WEBP_MSC_SSE2 + // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. #if (defined(__ARM_NEON__) || \ @@ -110,8 +112,6 @@ extern "C" { #define WEBP_USE_MSA #endif -#endif /* EMSCRIPTEN */ - #ifndef WEBP_DSP_OMIT_C_CODE #define WEBP_DSP_OMIT_C_CODE 1 #endif @@ -193,6 +193,12 @@ extern "C" { #endif #endif +// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'. +// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning. 
+#if !defined(WEBP_OFFSET_PTR) +#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off))) +#endif + // Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility) #if !defined(WEBP_SWAP_16BIT_CSP) #define WEBP_SWAP_16BIT_CSP 0 @@ -632,6 +638,8 @@ extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b, extern int (*WebPHasAlpha8b)(const uint8_t* src, int length); // This function returns true if src[4*i] contains a value different from 0xff. extern int (*WebPHasAlpha32b)(const uint8_t* src, int length); +// replaces transparent values in src[] by 'color'. +extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color); // To be called first before using the above. void WebPInitAlphaProcessing(void); diff --git a/3rdparty/libwebp/src/dsp/lossless.c b/3rdparty/libwebp/src/dsp/lossless.c index aad5f43ec9..46b220e2ed 100644 --- a/3rdparty/libwebp/src/dsp/lossless.c +++ b/3rdparty/libwebp/src/dsp/lossless.c @@ -107,62 +107,62 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { //------------------------------------------------------------------------------ // Predictors -static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top) { (void)top; (void)left; return ARGB_BLACK; } -static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top) { (void)top; return left; } -static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top) { (void)left; return top[0]; } -static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top) { (void)left; return top[1]; } -static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top) { (void)left; return top[-1]; } -static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average3(left, top[0], top[1]); return pred; } -static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(left, top[-1]); return pred; } -static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(left, top[0]); return pred; } -static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[-1], top[0]); (void)left; return pred; } -static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[0], top[1]); (void)left; return pred; } -static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average4(left, top[-1], top[0], top[1]); return pred; } -static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Select(top[0], left, top[-1]); 
return pred; } -static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); return pred; } -static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); return pred; } @@ -182,18 +182,18 @@ static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper, out[i] = left = VP8LAddPixels(in[i], left); } } -GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C) -GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C) -GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C) -GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C) -GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C) -GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C) -GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C) -GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C) -GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C) -GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C) -GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C) -GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C) //------------------------------------------------------------------------------ @@ -562,7 +562,6 @@ VP8LPredictorFunc VP8LPredictors[16]; // exposed plain-C implementations VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16]; -VP8LPredictorFunc VP8LPredictors_C[16]; VP8LTransformColorInverseFunc VP8LTransformColorInverse; @@ -600,8 +599,7 @@ extern void VP8LDspInitMSA(void); } while (0); WEBP_DSP_INIT_FUNC(VP8LDspInit) { - COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors) - COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C) + COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors) COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd) COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C) diff --git a/3rdparty/libwebp/src/dsp/lossless.h b/3rdparty/libwebp/src/dsp/lossless.h index f709cc86b2..ebd316d1ed 100644 --- a/3rdparty/libwebp/src/dsp/lossless.h +++ b/3rdparty/libwebp/src/dsp/lossless.h @@ -30,7 +30,22 @@ extern "C" { typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top); extern VP8LPredictorFunc VP8LPredictors[16]; -extern VP8LPredictorFunc VP8LPredictors_C[16]; + +uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top); +uint32_t 
VP8LPredictor6_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top); + // These Add/Sub function expects upper[-1] and out[-1] to be readable. typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in, const uint32_t* upper, int num_pixels, diff --git a/3rdparty/libwebp/src/dsp/lossless_common.h b/3rdparty/libwebp/src/dsp/lossless_common.h index 9c2ebe6809..96a106f9ee 100644 --- a/3rdparty/libwebp/src/dsp/lossless_common.h +++ b/3rdparty/libwebp/src/dsp/lossless_common.h @@ -184,19 +184,6 @@ static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \ } \ } -// It subtracts the prediction from the input pixel and stores the residual -// in the output pixel. -#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB) \ -static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \ - int num_pixels, uint32_t* out) { \ - int x; \ - assert(upper != NULL); \ - for (x = 0; x < num_pixels; ++x) { \ - const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x); \ - out[x] = VP8LSubPixels(in[x], pred); \ - } \ -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/3rdparty/libwebp/src/dsp/lossless_enc.c b/3rdparty/libwebp/src/dsp/lossless_enc.c index 9c36055afc..a0c7ab9117 100644 --- a/3rdparty/libwebp/src/dsp/lossless_enc.c +++ b/3rdparty/libwebp/src/dsp/lossless_enc.c @@ -702,140 +702,6 @@ void VP8LHistogramAdd(const VP8LHistogram* const a, //------------------------------------------------------------------------------ // Image transforms. -static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { - return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1); -} - -static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) { - return Average2(Average2(a0, a2), a1); -} - -static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, - uint32_t a2, uint32_t a3) { - return Average2(Average2(a0, a1), Average2(a2, a3)); -} - -static WEBP_INLINE uint32_t Clip255(uint32_t a) { - if (a < 256) { - return a; - } - // return 0, when a is a negative integer. - // return 255, when a is positive. 
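// Note on the branchless tail below: inputs reach Clip255() from signed
// arithmetic in [-255, 510], so a "negative" value wraps to a uint32_t with
// its top bits set and ~a >> 24 yields 0 (e.g. a == (uint32_t)-1), while a
// positive overflow in [256, 510] yields ~a >> 24 == 255.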
- return ~a >> 24; -} - -static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) { - return Clip255(a + b - c); -} - -static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, - uint32_t c2) { - const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24); - const int r = AddSubtractComponentFull((c0 >> 16) & 0xff, - (c1 >> 16) & 0xff, - (c2 >> 16) & 0xff); - const int g = AddSubtractComponentFull((c0 >> 8) & 0xff, - (c1 >> 8) & 0xff, - (c2 >> 8) & 0xff); - const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff); - return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b; -} - -static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) { - return Clip255(a + (a - b) / 2); -} - -static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, - uint32_t c2) { - const uint32_t ave = Average2(c0, c1); - const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24); - const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff); - const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff); - const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff); - return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b; -} - -// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined. -#if defined(__arm__) && \ - (LOCAL_GCC_VERSION == 0x409 || LOCAL_GCC_VERSION == 0x408) -# define LOCAL_INLINE __attribute__ ((noinline)) -#else -# define LOCAL_INLINE WEBP_INLINE -#endif - -static LOCAL_INLINE int Sub3(int a, int b, int c) { - const int pb = b - c; - const int pa = a - c; - return abs(pb) - abs(pa); -} - -#undef LOCAL_INLINE - -static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { - const int pa_minus_pb = - Sub3((a >> 24) , (b >> 24) , (c >> 24) ) + - Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) + - Sub3((a >> 8) & 0xff, (b >> 8) & 0xff, (c >> 8) & 0xff) + - Sub3((a ) & 0xff, (b ) & 0xff, (c ) & 0xff); - return (pa_minus_pb <= 0) ? 
a : b; -} - -//------------------------------------------------------------------------------ -// Predictors - -static uint32_t Predictor2(uint32_t left, const uint32_t* const top) { - (void)left; - return top[0]; -} -static uint32_t Predictor3(uint32_t left, const uint32_t* const top) { - (void)left; - return top[1]; -} -static uint32_t Predictor4(uint32_t left, const uint32_t* const top) { - (void)left; - return top[-1]; -} -static uint32_t Predictor5(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average3(left, top[0], top[1]); - return pred; -} -static uint32_t Predictor6(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(left, top[-1]); - return pred; -} -static uint32_t Predictor7(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(left, top[0]); - return pred; -} -static uint32_t Predictor8(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(top[-1], top[0]); - (void)left; - return pred; -} -static uint32_t Predictor9(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(top[0], top[1]); - (void)left; - return pred; -} -static uint32_t Predictor10(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average4(left, top[-1], top[0], top[1]); - return pred; -} -static uint32_t Predictor11(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Select(top[0], left, top[-1]); - return pred; -} -static uint32_t Predictor12(uint32_t left, const uint32_t* const top) { - const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); - return pred; -} -static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { - const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); - return pred; -} - -//------------------------------------------------------------------------------ - static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; @@ -850,18 +716,33 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper, (void)upper; } -GENERATE_PREDICTOR_SUB(Predictor2, PredictorSub2_C) -GENERATE_PREDICTOR_SUB(Predictor3, PredictorSub3_C) -GENERATE_PREDICTOR_SUB(Predictor4, PredictorSub4_C) -GENERATE_PREDICTOR_SUB(Predictor5, PredictorSub5_C) -GENERATE_PREDICTOR_SUB(Predictor6, PredictorSub6_C) -GENERATE_PREDICTOR_SUB(Predictor7, PredictorSub7_C) -GENERATE_PREDICTOR_SUB(Predictor8, PredictorSub8_C) -GENERATE_PREDICTOR_SUB(Predictor9, PredictorSub9_C) -GENERATE_PREDICTOR_SUB(Predictor10, PredictorSub10_C) -GENERATE_PREDICTOR_SUB(Predictor11, PredictorSub11_C) -GENERATE_PREDICTOR_SUB(Predictor12, PredictorSub12_C) -GENERATE_PREDICTOR_SUB(Predictor13, PredictorSub13_C) +// It subtracts the prediction from the input pixel and stores the residual +// in the output pixel. 
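// For instance, GENERATE_PREDICTOR_SUB(2) below expands to PredictorSub2_C(),
// whose residual is in[x] minus the pixel above (VP8LPredictor2_C() returns
// top[0]), computed per ARGB channel by VP8LSubPixels().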
+#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \ +static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \ + const uint32_t* upper, \ + int num_pixels, uint32_t* out) { \ + int x; \ + assert(upper != NULL); \ + for (x = 0; x < num_pixels; ++x) { \ + const uint32_t pred = \ + VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x); \ + out[x] = VP8LSubPixels(in[x], pred); \ + } \ +} + +GENERATE_PREDICTOR_SUB(2) +GENERATE_PREDICTOR_SUB(3) +GENERATE_PREDICTOR_SUB(4) +GENERATE_PREDICTOR_SUB(5) +GENERATE_PREDICTOR_SUB(6) +GENERATE_PREDICTOR_SUB(7) +GENERATE_PREDICTOR_SUB(8) +GENERATE_PREDICTOR_SUB(9) +GENERATE_PREDICTOR_SUB(10) +GENERATE_PREDICTOR_SUB(11) +GENERATE_PREDICTOR_SUB(12) +GENERATE_PREDICTOR_SUB(13) //------------------------------------------------------------------------------ diff --git a/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c index e676f6fdc9..90c263735f 100644 --- a/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -249,6 +249,7 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { } \ } while (0) +#if !(defined(__i386__) || defined(_M_IX86)) static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) { int i; double retval = 0.; @@ -300,6 +301,8 @@ static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) { retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); return (float)retval; } +#endif // !(defined(__i386__) || defined(_M_IX86)) + #undef ANALYZE_X_OR_Y #undef ANALYZE_XY @@ -460,20 +463,22 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, (void)upper; } -#define GENERATE_PREDICTOR_1(X, IN) \ -static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ - int num_pixels, uint32_t* out) { \ - int i; \ - for (i = 0; i + 4 <= num_pixels; i += 4) { \ - const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ - const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \ - const __m128i res = _mm_sub_epi8(src, pred); \ - _mm_storeu_si128((__m128i*)&out[i], res); \ - } \ - if (i != num_pixels) { \ - VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ - } \ -} +#define GENERATE_PREDICTOR_1(X, IN) \ + static void PredictorSub##X##_SSE2(const uint32_t* const in, \ + const uint32_t* const upper, \ + int num_pixels, uint32_t* const out) { \ + int i; \ + for (i = 0; i + 4 <= num_pixels; i += 4) { \ + const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ + const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \ + const __m128i res = _mm_sub_epi8(src, pred); \ + _mm_storeu_si128((__m128i*)&out[i], res); \ + } \ + if (i != num_pixels) { \ + VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i), \ + num_pixels - i, out + i); \ + } \ + } GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T @@ -657,7 +662,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2; VP8LAddVector = AddVector_SSE2; VP8LAddVectorEq = AddVectorEq_SSE2; + // TODO(https://crbug.com/webp/499): this function produces different results + // from the C code due to use of double/float resulting in output differences + // when compared to -noasm. 
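// Note: on 32-bit x86 the double/float accumulation above can go through x87
// registers with 80-bit extended precision, so intermediate rounding differs
// from x86-64/SSE2 builds; keeping the plain-C entropy path on i386 keeps the
// output bit-exact with -noasm.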
+#if !(defined(__i386__) || defined(_M_IX86)) VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2; +#endif VP8LVectorMismatch = VectorMismatch_SSE2; VP8LBundleColorMap = BundleColorMap_SSE2; diff --git a/3rdparty/libwebp/src/enc/analysis_enc.c b/3rdparty/libwebp/src/enc/analysis_enc.c index 687757ae03..ebb784261c 100644 --- a/3rdparty/libwebp/src/enc/analysis_enc.c +++ b/3rdparty/libwebp/src/enc/analysis_enc.c @@ -126,16 +126,6 @@ static void InitHistogram(VP8Histogram* const histo) { histo->last_non_zero = 1; } -static void MergeHistograms(const VP8Histogram* const in, - VP8Histogram* const out) { - if (in->max_value > out->max_value) { - out->max_value = in->max_value; - } - if (in->last_non_zero > out->last_non_zero) { - out->last_non_zero = in->last_non_zero; - } -} - //------------------------------------------------------------------------------ // Simplified k-Means, to assign Nb segments based on alpha-histogram @@ -285,49 +275,6 @@ static int FastMBAnalyze(VP8EncIterator* const it) { return 0; } -static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it, - int best_alpha) { - uint8_t modes[16]; - const int max_mode = MAX_INTRA4_MODE; - int i4_alpha; - VP8Histogram total_histo; - int cur_histo = 0; - InitHistogram(&total_histo); - - VP8IteratorStartI4(it); - do { - int mode; - int best_mode_alpha = DEFAULT_ALPHA; - VP8Histogram histos[2]; - const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_]; - - VP8MakeIntra4Preds(it); - for (mode = 0; mode < max_mode; ++mode) { - int alpha; - - InitHistogram(&histos[cur_histo]); - VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode], - 0, 1, &histos[cur_histo]); - alpha = GetAlpha(&histos[cur_histo]); - if (IS_BETTER_ALPHA(alpha, best_mode_alpha)) { - best_mode_alpha = alpha; - modes[it->i4_] = mode; - cur_histo ^= 1; // keep track of best histo so far. - } - } - // accumulate best histogram - MergeHistograms(&histos[cur_histo ^ 1], &total_histo); - // Note: we reuse the original samples for predictors - } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF_ENC)); - - i4_alpha = GetAlpha(&total_histo); - if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) { - VP8SetIntra4Mode(it, modes); - best_alpha = i4_alpha; - } - return best_alpha; -} - static int MBAnalyzeBestUVMode(VP8EncIterator* const it) { int best_alpha = DEFAULT_ALPHA; int smallest_alpha = 0; @@ -371,13 +318,6 @@ static void MBAnalyze(VP8EncIterator* const it, best_alpha = FastMBAnalyze(it); } else { best_alpha = MBAnalyzeBestIntra16Mode(it); - if (enc->method_ >= 5) { - // We go and make a fast decision for intra4/intra16. - // It's usually not a good and definitive pick, but helps seeding the - // stats about level bit-cost. - // TODO(skal): improve criterion. 
- best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha); - } } best_uv_alpha = MBAnalyzeBestUVMode(it); diff --git a/3rdparty/libwebp/src/enc/backward_references_enc.c b/3rdparty/libwebp/src/enc/backward_references_enc.c index d445b40fc5..519b36a091 100644 --- a/3rdparty/libwebp/src/enc/backward_references_enc.c +++ b/3rdparty/libwebp/src/enc/backward_references_enc.c @@ -11,13 +11,14 @@ // #include +#include #include -#include "src/enc/backward_references_enc.h" -#include "src/enc/histogram_enc.h" +#include "src/dsp/dsp.h" #include "src/dsp/lossless.h" #include "src/dsp/lossless_common.h" -#include "src/dsp/dsp.h" +#include "src/enc/backward_references_enc.h" +#include "src/enc/histogram_enc.h" #include "src/utils/color_cache_utils.h" #include "src/utils/utils.h" @@ -103,6 +104,20 @@ void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) { } } +// Swaps the content of two VP8LBackwardRefs. +static void BackwardRefsSwap(VP8LBackwardRefs* const refs1, + VP8LBackwardRefs* const refs2) { + const int point_to_refs1 = + (refs1->tail_ != NULL && refs1->tail_ == &refs1->refs_); + const int point_to_refs2 = + (refs2->tail_ != NULL && refs2->tail_ == &refs2->refs_); + const VP8LBackwardRefs tmp = *refs1; + *refs1 = *refs2; + *refs2 = tmp; + if (point_to_refs2) refs1->tail_ = &refs1->refs_; + if (point_to_refs1) refs2->tail_ = &refs2->refs_; +} + void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) { assert(refs != NULL); memset(refs, 0, sizeof(*refs)); @@ -154,6 +169,22 @@ static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) { return b; } +// Return 1 on success, 0 on error. +static int BackwardRefsClone(const VP8LBackwardRefs* const from, + VP8LBackwardRefs* const to) { + const PixOrCopyBlock* block_from = from->refs_; + VP8LClearBackwardRefs(to); + while (block_from != NULL) { + PixOrCopyBlock* const block_to = BackwardRefsNewBlock(to); + if (block_to == NULL) return 0; + memcpy(block_to->start_, block_from->start_, + block_from->size_ * sizeof(PixOrCopy)); + block_to->size_ = block_from->size_; + block_from = block_from->next_; + } + return 1; +} + extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs, const PixOrCopy v); void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs, @@ -753,12 +784,18 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality, } } } else { + int code, extra_bits, extra_bits_value; // We should compute the contribution of the (distance,length) // histograms but those are the same independently from the cache size. // As those constant contributions are in the end added to the other - // histogram contributions, we can safely ignore them. + // histogram contributions, we can ignore them, except for the length + // prefix that is part of the literal_ histogram. int len = PixOrCopyLength(v); uint32_t argb_prev = *argb ^ 0xffffffffu; + VP8LPrefixEncode(len, &code, &extra_bits, &extra_bits_value); + for (i = 0; i <= cache_bits_max; ++i) { + ++histos[i]->literal_[NUM_LITERAL_CODES + code]; + } // Update the color caches. 
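// (VP8LPrefixEncode() above maps a copy length to its prefix code and extra
// bits -- e.g. length 5 becomes code 4 with one extra bit -- and that code
// indexes literal_[] past the NUM_LITERAL_CODES plain-literal slots; the loop
// below then updates every candidate cache size during the same scan of the
// references.)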
do { if (*argb != argb_prev) { @@ -842,16 +879,21 @@ extern int VP8LBackwardReferencesTraceBackwards( int xsize, int ysize, const uint32_t* const argb, int cache_bits, const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst); -static VP8LBackwardRefs* GetBackwardReferences( - int width, int height, const uint32_t* const argb, int quality, - int lz77_types_to_try, int* const cache_bits, - const VP8LHashChain* const hash_chain, VP8LBackwardRefs* best, - VP8LBackwardRefs* worst) { - const int cache_bits_initial = *cache_bits; - double bit_cost_best = -1; +static int GetBackwardReferences(int width, int height, + const uint32_t* const argb, int quality, + int lz77_types_to_try, int cache_bits_max, + int do_no_cache, + const VP8LHashChain* const hash_chain, + VP8LBackwardRefs* const refs, + int* const cache_bits_best) { VP8LHistogram* histo = NULL; - int lz77_type, lz77_type_best = 0; + int i, lz77_type; + // Index 0 is for a color cache, index 1 for no cache (if needed). + int lz77_types_best[2] = {0, 0}; + double bit_costs_best[2] = {DBL_MAX, DBL_MAX}; VP8LHashChain hash_chain_box; + VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1]; + int status = 0; memset(&hash_chain_box, 0, sizeof(hash_chain_box)); histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS); @@ -860,86 +902,129 @@ static VP8LBackwardRefs* GetBackwardReferences( for (lz77_type = 1; lz77_types_to_try; lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) { int res = 0; - double bit_cost; - int cache_bits_tmp = cache_bits_initial; + double bit_cost = 0.; if ((lz77_types_to_try & lz77_type) == 0) continue; switch (lz77_type) { case kLZ77RLE: - res = BackwardReferencesRle(width, height, argb, 0, worst); + res = BackwardReferencesRle(width, height, argb, 0, refs_tmp); break; case kLZ77Standard: // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color // cache is not that different in practice. - res = BackwardReferencesLz77(width, height, argb, 0, hash_chain, worst); + res = BackwardReferencesLz77(width, height, argb, 0, hash_chain, + refs_tmp); break; case kLZ77Box: if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error; res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain, - &hash_chain_box, worst); + &hash_chain_box, refs_tmp); break; default: assert(0); } if (!res) goto Error; - // Next, try with a color cache and update the references. - if (!CalculateBestCacheSize(argb, quality, worst, &cache_bits_tmp)) { - goto Error; - } - if (cache_bits_tmp > 0) { - if (!BackwardRefsWithLocalCache(argb, cache_bits_tmp, worst)) { - goto Error; + // Start with the no color cache case. + for (i = 1; i >= 0; --i) { + int cache_bits = (i == 1) ? 0 : cache_bits_max; + + if (i == 1 && !do_no_cache) continue; + + if (i == 0) { + // Try with a color cache. + if (!CalculateBestCacheSize(argb, quality, refs_tmp, &cache_bits)) { + goto Error; + } + if (cache_bits > 0) { + if (!BackwardRefsWithLocalCache(argb, cache_bits, refs_tmp)) { + goto Error; + } + } + } + + if (i == 0 && do_no_cache && cache_bits == 0) { + // No need to re-compute bit_cost as it was computed at i == 1. + } else { + VP8LHistogramCreate(histo, refs_tmp, cache_bits); + bit_cost = VP8LHistogramEstimateBits(histo); + } + + if (bit_cost < bit_costs_best[i]) { + if (i == 1) { + // Do not swap as the full cache analysis would have the wrong + // VP8LBackwardRefs to start with. 
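// (BackwardRefsClone() copies block by block, so refs_tmp stays usable as
// scratch for the color-cache pass that follows at i == 0.)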
+ if (!BackwardRefsClone(refs_tmp, &refs[1])) goto Error; + } else { + BackwardRefsSwap(refs_tmp, &refs[0]); + } + bit_costs_best[i] = bit_cost; + lz77_types_best[i] = lz77_type; + if (i == 0) *cache_bits_best = cache_bits; } } - - // Keep the best backward references. - VP8LHistogramCreate(histo, worst, cache_bits_tmp); - bit_cost = VP8LHistogramEstimateBits(histo); - if (lz77_type_best == 0 || bit_cost < bit_cost_best) { - VP8LBackwardRefs* const tmp = worst; - worst = best; - best = tmp; - bit_cost_best = bit_cost; - *cache_bits = cache_bits_tmp; - lz77_type_best = lz77_type; - } } - assert(lz77_type_best > 0); + assert(lz77_types_best[0] > 0); + assert(!do_no_cache || lz77_types_best[1] > 0); // Improve on simple LZ77 but only for high quality (TraceBackwards is // costly). - if ((lz77_type_best == kLZ77Standard || lz77_type_best == kLZ77Box) && - quality >= 25) { - const VP8LHashChain* const hash_chain_tmp = - (lz77_type_best == kLZ77Standard) ? hash_chain : &hash_chain_box; - if (VP8LBackwardReferencesTraceBackwards(width, height, argb, *cache_bits, - hash_chain_tmp, best, worst)) { - double bit_cost_trace; - VP8LHistogramCreate(histo, worst, *cache_bits); - bit_cost_trace = VP8LHistogramEstimateBits(histo); - if (bit_cost_trace < bit_cost_best) best = worst; + for (i = 1; i >= 0; --i) { + if (i == 1 && !do_no_cache) continue; + if ((lz77_types_best[i] == kLZ77Standard || + lz77_types_best[i] == kLZ77Box) && + quality >= 25) { + const VP8LHashChain* const hash_chain_tmp = + (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box; + const int cache_bits = (i == 1) ? 0 : *cache_bits_best; + if (VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits, + hash_chain_tmp, &refs[i], + refs_tmp)) { + double bit_cost_trace; + VP8LHistogramCreate(histo, refs_tmp, cache_bits); + bit_cost_trace = VP8LHistogramEstimateBits(histo); + if (bit_cost_trace < bit_costs_best[i]) { + BackwardRefsSwap(refs_tmp, &refs[i]); + } + } + } + + BackwardReferences2DLocality(width, &refs[i]); + + if (i == 1 && lz77_types_best[0] == lz77_types_best[1] && + *cache_bits_best == 0) { + // If the best cache size is 0 and we have the same best LZ77, just copy + // the data over and stop here. + if (!BackwardRefsClone(&refs[1], &refs[0])) goto Error; + break; } } - - BackwardReferences2DLocality(width, best); + status = 1; Error: VP8LHashChainClear(&hash_chain_box); VP8LFreeHistogram(histo); - return best; + return status; } -VP8LBackwardRefs* VP8LGetBackwardReferences( +WebPEncodingError VP8LGetBackwardReferences( int width, int height, const uint32_t* const argb, int quality, - int low_effort, int lz77_types_to_try, int* const cache_bits, - const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1, - VP8LBackwardRefs* const refs_tmp2) { + int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs, + int* const cache_bits_best) { if (low_effort) { - return GetBackwardReferencesLowEffort(width, height, argb, cache_bits, - hash_chain, refs_tmp1); + VP8LBackwardRefs* refs_best; + *cache_bits_best = cache_bits_max; + refs_best = GetBackwardReferencesLowEffort( + width, height, argb, cache_bits_best, hash_chain, refs); + if (refs_best == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY; + // Set it in first position. 
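// (BackwardRefsSwap() is O(1): it exchanges the two structs and only
// re-points their internal tail_ fields, rather than copying blocks.)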
+ BackwardRefsSwap(refs_best, &refs[0]); } else { - return GetBackwardReferences(width, height, argb, quality, - lz77_types_to_try, cache_bits, hash_chain, - refs_tmp1, refs_tmp2); + if (!GetBackwardReferences(width, height, argb, quality, lz77_types_to_try, + cache_bits_max, do_no_cache, hash_chain, refs, + cache_bits_best)) { + return VP8_ENC_ERROR_OUT_OF_MEMORY; + } } + return VP8_ENC_OK; } diff --git a/3rdparty/libwebp/src/enc/backward_references_enc.h b/3rdparty/libwebp/src/enc/backward_references_enc.h index 103ddfdcb7..4c0267b41e 100644 --- a/3rdparty/libwebp/src/enc/backward_references_enc.h +++ b/3rdparty/libwebp/src/enc/backward_references_enc.h @@ -16,6 +16,7 @@ #include #include #include "src/webp/types.h" +#include "src/webp/encode.h" #include "src/webp/format_constants.h" #ifdef __cplusplus @@ -218,14 +219,19 @@ enum VP8LLZ77Type { // Evaluates best possible backward references for specified quality. // The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache // bits to use (passing 0 implies disabling the local color cache). -// The optimal cache bits is evaluated and set for the *cache_bits parameter. -// The return value is the pointer to the best of the two backward refs viz, -// refs[0] or refs[1]. -VP8LBackwardRefs* VP8LGetBackwardReferences( +// The optimal cache bits is evaluated and set for the *cache_bits_best +// parameter with the matching refs_best. +// If do_no_cache == 0, refs is an array of 2 values and the best +// VP8LBackwardRefs is put in the first element. +// If do_no_cache != 0, refs is an array of 3 values and the best +// VP8LBackwardRefs is put in the first element, the best value with no-cache in +// the second element. +// In both cases, the last element is used as temporary internally. +WebPEncodingError VP8LGetBackwardReferences( int width, int height, const uint32_t* const argb, int quality, - int low_effort, int lz77_types_to_try, int* const cache_bits, - const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1, - VP8LBackwardRefs* const refs_tmp2); + int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs, + int* const cache_bits_best); #ifdef __cplusplus } diff --git a/3rdparty/libwebp/src/enc/config_enc.c b/3rdparty/libwebp/src/enc/config_enc.c index 9d4828978e..3518b41403 100644 --- a/3rdparty/libwebp/src/enc/config_enc.c +++ b/3rdparty/libwebp/src/enc/config_enc.c @@ -39,6 +39,8 @@ int WebPConfigInitInternal(WebPConfig* config, config->partitions = 0; config->segments = 4; config->pass = 1; + config->qmin = 0; + config->qmax = 100; config->show_compressed = 0; config->preprocessing = 0; config->autofilter = 0; @@ -106,6 +108,9 @@ int WebPValidateConfig(const WebPConfig* config) { if (config->filter_type < 0 || config->filter_type > 1) return 0; if (config->autofilter < 0 || config->autofilter > 1) return 0; if (config->pass < 1 || config->pass > 10) return 0; + if (config->qmin < 0 || config->qmax > 100 || config->qmin > config->qmax) { + return 0; + } if (config->show_compressed < 0 || config->show_compressed > 1) return 0; if (config->preprocessing < 0 || config->preprocessing > 7) return 0; if (config->partitions < 0 || config->partitions > 3) return 0; diff --git a/3rdparty/libwebp/src/enc/frame_enc.c b/3rdparty/libwebp/src/enc/frame_enc.c index 1aec376e44..af538d83ba 100644 --- a/3rdparty/libwebp/src/enc/frame_enc.c +++ b/3rdparty/libwebp/src/enc/frame_enc.c @@ -31,10 +31,15 @@ // we allow 2k of extra head-room 
in PARTITION0 limit. #define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11) +static float Clamp(float v, float min, float max) { + return (v < min) ? min : (v > max) ? max : v; +} + typedef struct { // struct for organizing convergence in either size or PSNR int is_first; float dq; float q, last_q; + float qmin, qmax; double value, last_value; // PSNR or size double target; int do_size_search; @@ -47,7 +52,9 @@ static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) { s->is_first = 1; s->dq = 10.f; - s->q = s->last_q = enc->config_->quality; + s->qmin = 1.f * enc->config_->qmin; + s->qmax = 1.f * enc->config_->qmax; + s->q = s->last_q = Clamp(enc->config_->quality, s->qmin, s->qmax); s->target = do_size_search ? (double)target_size : (target_PSNR > 0.) ? target_PSNR : 40.; // default, just in case @@ -56,10 +63,6 @@ static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) { return do_size_search; } -static float Clamp(float v, float min, float max) { - return (v < min) ? min : (v > max) ? max : v; -} - static float ComputeNextQ(PassStats* const s) { float dq; if (s->is_first) { @@ -75,7 +78,7 @@ static float ComputeNextQ(PassStats* const s) { s->dq = Clamp(dq, -30.f, 30.f); s->last_q = s->q; s->last_value = s->value; - s->q = Clamp(s->q + s->dq, 0.f, 100.f); + s->q = Clamp(s->q + s->dq, s->qmin, s->qmax); return s->q; } @@ -848,9 +851,10 @@ int VP8EncTokenLoop(VP8Encoder* const enc) { } #if (DEBUG_SEARCH > 0) - printf("#%2d metric:%.1lf -> %.1lf last_q=%.2lf q=%.2lf dq=%.2lf\n", + printf("#%2d metric:%.1lf -> %.1lf last_q=%.2lf q=%.2lf dq=%.2lf " + " range:[%.1f, %.1f]\n", num_pass_left, stats.last_value, stats.value, - stats.last_q, stats.q, stats.dq); + stats.last_q, stats.q, stats.dq, stats.qmin, stats.qmax); #endif if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) { ++num_pass_left; diff --git a/3rdparty/libwebp/src/enc/histogram_enc.c b/3rdparty/libwebp/src/enc/histogram_enc.c index a4e6bf3a98..edc6e4faa4 100644 --- a/3rdparty/libwebp/src/enc/histogram_enc.c +++ b/3rdparty/libwebp/src/enc/histogram_enc.c @@ -208,6 +208,7 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo, } else if (PixOrCopyIsCacheIdx(v)) { const int literal_ix = NUM_LITERAL_CODES + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v); + assert(histo->palette_code_bits_ != 0); ++histo->literal_[literal_ix]; } else { int code, extra_bits; diff --git a/3rdparty/libwebp/src/enc/picture_csp_enc.c b/3rdparty/libwebp/src/enc/picture_csp_enc.c index 718e014ed2..35eede9635 100644 --- a/3rdparty/libwebp/src/enc/picture_csp_enc.c +++ b/3rdparty/libwebp/src/enc/picture_csp_enc.c @@ -61,16 +61,14 @@ static int CheckNonOpaque(const uint8_t* alpha, int width, int height, // Checking for the presence of non-opaque alpha. 
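// A minimal sketch of driving the qmin/qmax knobs wired in above through the
// public API (WebPConfigInit()/WebPValidateConfig(); error handling elided):
//
//   WebPConfig config;
//   if (!WebPConfigInit(&config)) return 0;
//   config.target_size = 30000;  // size search over several passes...
//   config.pass = 6;
//   config.qmin = 40;            // ...with quality confined to [40, 90]
//   config.qmax = 90;
//   if (!WebPValidateConfig(&config)) return 0;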
int WebPPictureHasTransparency(const WebPPicture* picture) { if (picture == NULL) return 0; - if (!picture->use_argb) { - return CheckNonOpaque(picture->a, picture->width, picture->height, - 1, picture->a_stride); - } else { + if (picture->use_argb) { const int alpha_offset = ALPHA_OFFSET; return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset, picture->width, picture->height, 4, picture->argb_stride * sizeof(*picture->argb)); } - return 0; + return CheckNonOpaque(picture->a, picture->width, picture->height, + 1, picture->a_stride); } //------------------------------------------------------------------------------ @@ -90,8 +88,9 @@ int WebPPictureHasTransparency(const WebPPicture* picture) { static int kLinearToGammaTab[kGammaTabSize + 1]; static uint16_t kGammaToLinearTab[256]; static volatile int kGammaTablesOk = 0; +static void InitGammaTables(void); -static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) { +WEBP_DSP_INIT_FUNC(InitGammaTables) { if (!kGammaTablesOk) { int v; const double scale = (double)(1 << kGammaTabFix) / kGammaScale; @@ -181,8 +180,9 @@ static uint32_t kLinearToGammaTabS[kGammaTabSize + 2]; #define GAMMA_TO_LINEAR_BITS 14 static uint32_t kGammaToLinearTabS[MAX_Y_T + 1]; // size scales with Y_FIX static volatile int kGammaTablesSOk = 0; +static void InitGammaTablesS(void); -static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) { +WEBP_DSP_INIT_FUNC(InitGammaTablesS) { assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values if (!kGammaTablesSOk) { int v; diff --git a/3rdparty/libwebp/src/enc/picture_tools_enc.c b/3rdparty/libwebp/src/enc/picture_tools_enc.c index d0e8a495da..38cb01534a 100644 --- a/3rdparty/libwebp/src/enc/picture_tools_enc.c +++ b/3rdparty/libwebp/src/enc/picture_tools_enc.c @@ -83,6 +83,19 @@ static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr, return (count == 0); } +void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color) { + if (pic != NULL && pic->use_argb) { + int y = pic->height; + uint32_t* argb = pic->argb; + color &= 0xffffffu; // force alpha=0 + WebPInitAlphaProcessing(); + while (y-- > 0) { + WebPAlphaReplace(argb, pic->width, color); + argb += pic->argb_stride; + } + } +} + void WebPCleanupTransparentArea(WebPPicture* pic) { int x, y, w, h; if (pic == NULL) return; @@ -165,24 +178,6 @@ void WebPCleanupTransparentArea(WebPPicture* pic) { #undef SIZE #undef SIZE2 -void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) { - int x, y, w, h; - uint32_t* argb; - assert(pic != NULL && pic->use_argb); - w = pic->width; - h = pic->height; - argb = pic->argb; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - if ((argb[x] & 0xff000000) == 0) { - argb[x] = 0x00000000; - } - } - argb += pic->argb_stride; - } -} - //------------------------------------------------------------------------------ // Blend color and remove transparency info diff --git a/3rdparty/libwebp/src/enc/vp8i_enc.h b/3rdparty/libwebp/src/enc/vp8i_enc.h index fedcaeea27..0e35562a8c 100644 --- a/3rdparty/libwebp/src/enc/vp8i_enc.h +++ b/3rdparty/libwebp/src/enc/vp8i_enc.h @@ -31,7 +31,7 @@ extern "C" { // version numbers #define ENC_MAJ_VERSION 1 -#define ENC_MIN_VERSION 1 +#define ENC_MIN_VERSION 2 #define ENC_REV_VERSION 0 enum { MAX_LF_LEVELS = 64, // Maximum loop filter level @@ -505,9 +505,9 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height); // Returns false in case of error (invalid param, out-of-memory). 
int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height); -// Clean-up the RGB samples under fully transparent area, to help lossless -// compressibility (no guarantee, though). Assumes that pic->use_argb is true. -void WebPCleanupTransparentAreaLossless(WebPPicture* const pic); +// Replace samples that are fully transparent by 'color' to help compressibility +// (no guarantee, though). Assumes pic->use_argb is true. +void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color); //------------------------------------------------------------------------------ diff --git a/3rdparty/libwebp/src/enc/vp8l_enc.c b/3rdparty/libwebp/src/enc/vp8l_enc.c index 2efd403f77..0b44ebe46e 100644 --- a/3rdparty/libwebp/src/enc/vp8l_enc.c +++ b/3rdparty/libwebp/src/enc/vp8l_enc.c @@ -144,7 +144,8 @@ typedef enum { kSubGreen = 2, kSpatialSubGreen = 3, kPalette = 4, - kNumEntropyIx = 5 + kPaletteAndSpatial = 5, + kNumEntropyIx = 6 } EntropyIx; typedef enum { @@ -354,11 +355,15 @@ static int GetTransformBits(int method, int histo_bits) { } // Set of parameters to be used in each iteration of the cruncher. -#define CRUNCH_CONFIGS_LZ77_MAX 2 +#define CRUNCH_SUBCONFIGS_MAX 2 +typedef struct { + int lz77_; + int do_no_cache_; +} CrunchSubConfig; typedef struct { int entropy_idx_; - int lz77s_types_to_try_[CRUNCH_CONFIGS_LZ77_MAX]; - int lz77s_types_to_try_size_; + CrunchSubConfig sub_configs_[CRUNCH_SUBCONFIGS_MAX]; + int sub_configs_size_; } CrunchConfig; #define CRUNCH_CONFIGS_MAX kNumEntropyIx @@ -376,6 +381,9 @@ static int EncoderAnalyze(VP8LEncoder* const enc, int i; int use_palette; int n_lz77s; + // If set to 0, analyze the cache with the computed cache value. If 1, also + // analyze with no-cache. + int do_no_cache = 0; assert(pic != NULL && pic->argb != NULL); use_palette = @@ -402,10 +410,13 @@ static int EncoderAnalyze(VP8LEncoder* const enc, return 0; } if (method == 6 && config->quality == 100) { + do_no_cache = 1; // Go brute force on all transforms. *crunch_configs_size = 0; for (i = 0; i < kNumEntropyIx; ++i) { - if (i != kPalette || use_palette) { + // We can only apply kPalette or kPaletteAndSpatial if we can indeed use + // a palette. + if ((i != kPalette && i != kPaletteAndSpatial) || use_palette) { assert(*crunch_configs_size < CRUNCH_CONFIGS_MAX); crunch_configs[(*crunch_configs_size)++].entropy_idx_ = i; } @@ -414,17 +425,28 @@ static int EncoderAnalyze(VP8LEncoder* const enc, // Only choose the guessed best transform. *crunch_configs_size = 1; crunch_configs[0].entropy_idx_ = min_entropy_ix; + if (config->quality >= 75 && method == 5) { + // Test with and without color cache. + do_no_cache = 1; + // If we have a palette, also check in combination with spatial. + if (min_entropy_ix == kPalette) { + *crunch_configs_size = 2; + crunch_configs[1].entropy_idx_ = kPaletteAndSpatial; + } + } } } // Fill in the different LZ77s. - assert(n_lz77s <= CRUNCH_CONFIGS_LZ77_MAX); + assert(n_lz77s <= CRUNCH_SUBCONFIGS_MAX); for (i = 0; i < *crunch_configs_size; ++i) { int j; for (j = 0; j < n_lz77s; ++j) { - crunch_configs[i].lz77s_types_to_try_[j] = + assert(j < CRUNCH_SUBCONFIGS_MAX); + crunch_configs[i].sub_configs_[j].lz77_ = (j == 0) ? 
kLZ77Standard | kLZ77RLE : kLZ77Box; + crunch_configs[i].sub_configs_[j].do_no_cache_ = do_no_cache; } - crunch_configs[i].lz77s_types_to_try_size_ = n_lz77s; + crunch_configs[i].sub_configs_size_ = n_lz77s; } return 1; } @@ -440,7 +462,7 @@ static int EncoderInit(VP8LEncoder* const enc) { int i; if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0; - for (i = 0; i < 3; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size); + for (i = 0; i < 4; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size); return 1; } @@ -769,13 +791,10 @@ static WebPEncodingError StoreImageToBitMask( } // Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31 -static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw, - const uint32_t* const argb, - VP8LHashChain* const hash_chain, - VP8LBackwardRefs* const refs_tmp1, - VP8LBackwardRefs* const refs_tmp2, - int width, int height, - int quality, int low_effort) { +static WebPEncodingError EncodeImageNoHuffman( + VP8LBitWriter* const bw, const uint32_t* const argb, + VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_array, + int width, int height, int quality, int low_effort) { int i; int max_tokens = 0; WebPEncodingError err = VP8_ENC_OK; @@ -798,13 +817,11 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw, err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; } - refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, - kLZ77Standard | kLZ77RLE, &cache_bits, - hash_chain, refs_tmp1, refs_tmp2); - if (refs == NULL) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } + err = VP8LGetBackwardReferences( + width, height, argb, quality, /*low_effort=*/0, kLZ77Standard | kLZ77RLE, + cache_bits, /*do_no_cache=*/0, hash_chain, refs_array, &cache_bits); + if (err != VP8_ENC_OK) goto Error; + refs = &refs_array[0]; histogram_image = VP8LAllocateHistogramSet(1, cache_bits); if (histogram_image == NULL) { err = VP8_ENC_ERROR_OUT_OF_MEMORY; @@ -860,11 +877,11 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw, static WebPEncodingError EncodeImageInternal( VP8LBitWriter* const bw, const uint32_t* const argb, - VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[3], int width, + VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width, int height, int quality, int low_effort, int use_cache, const CrunchConfig* const config, int* cache_bits, int histogram_bits, size_t init_byte_position, int* const hdr_size, int* const data_size) { - WebPEncodingError err = VP8_ENC_OK; + WebPEncodingError err = VP8_ENC_ERROR_OUT_OF_MEMORY; const uint32_t histogram_image_xysize = VP8LSubSampleSize(width, histogram_bits) * VP8LSubSampleSize(height, histogram_bits); @@ -876,103 +893,103 @@ static WebPEncodingError EncodeImageInternal( 3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree)); HuffmanTreeToken* tokens = NULL; HuffmanTreeCode* huffman_codes = NULL; - VP8LBackwardRefs* refs_best; - VP8LBackwardRefs* refs_tmp; uint16_t* const histogram_symbols = (uint16_t*)WebPSafeMalloc(histogram_image_xysize, sizeof(*histogram_symbols)); - int lz77s_idx; + int sub_configs_idx; + int cache_bits_init, write_histogram_image; VP8LBitWriter bw_init = *bw, bw_best; int hdr_size_tmp; + VP8LHashChain hash_chain_histogram; // histogram image hash chain + size_t bw_size_best = ~(size_t)0; assert(histogram_bits >= MIN_HUFFMAN_BITS); assert(histogram_bits <= MAX_HUFFMAN_BITS); assert(hdr_size != NULL); assert(data_size != NULL); - if (histogram_symbols == NULL) { - err = 
VP8_ENC_ERROR_OUT_OF_MEMORY; + // Make sure we can allocate the different objects. + memset(&hash_chain_histogram, 0, sizeof(hash_chain_histogram)); + if (huff_tree == NULL || histogram_symbols == NULL || + !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize) || + !VP8LHashChainFill(hash_chain, quality, argb, width, height, + low_effort)) { goto Error; } - if (use_cache) { // If the value is different from zero, it has been set during the // palette analysis. - if (*cache_bits == 0) *cache_bits = MAX_COLOR_CACHE_BITS; + cache_bits_init = (*cache_bits == 0) ? MAX_COLOR_CACHE_BITS : *cache_bits; } else { - *cache_bits = 0; + cache_bits_init = 0; } - // 'best_refs' is the reference to the best backward refs and points to one - // of refs_array[0] or refs_array[1]. - // Calculate backward references from ARGB image. - if (huff_tree == NULL || - !VP8LHashChainFill(hash_chain, quality, argb, width, height, - low_effort) || - !VP8LBitWriterInit(&bw_best, 0) || - (config->lz77s_types_to_try_size_ > 1 && + // If several iterations will happen, clone into bw_best. + if (!VP8LBitWriterInit(&bw_best, 0) || + ((config->sub_configs_size_ > 1 || + config->sub_configs_[0].do_no_cache_) && !VP8LBitWriterClone(bw, &bw_best))) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; } - for (lz77s_idx = 0; lz77s_idx < config->lz77s_types_to_try_size_; - ++lz77s_idx) { - refs_best = VP8LGetBackwardReferences( - width, height, argb, quality, low_effort, - config->lz77s_types_to_try_[lz77s_idx], cache_bits, hash_chain, - &refs_array[0], &refs_array[1]); - if (refs_best == NULL) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - // Keep the best references aside and use the other element from the first - // two as a temporary for later usage. - refs_tmp = &refs_array[refs_best == &refs_array[0] ? 1 : 0]; + for (sub_configs_idx = 0; sub_configs_idx < config->sub_configs_size_; + ++sub_configs_idx) { + const CrunchSubConfig* const sub_config = + &config->sub_configs_[sub_configs_idx]; + int cache_bits_best, i_cache; + err = VP8LGetBackwardReferences(width, height, argb, quality, low_effort, + sub_config->lz77_, cache_bits_init, + sub_config->do_no_cache_, hash_chain, + &refs_array[0], &cache_bits_best); + if (err != VP8_ENC_OK) goto Error; - histogram_image = - VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits); - tmp_histo = VP8LAllocateHistogram(*cache_bits); - if (histogram_image == NULL || tmp_histo == NULL) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } + for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) { + const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0; + // Speed-up: no need to study the no-cache case if it was already studied + // in i_cache == 0. + if (i_cache == 1 && cache_bits_best == 0) break; - // Build histogram image and symbols from backward references. - if (!VP8LGetHistoImageSymbols(width, height, refs_best, quality, low_effort, - histogram_bits, *cache_bits, histogram_image, - tmp_histo, histogram_symbols)) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - // Create Huffman bit lengths and codes for each histogram image. - histogram_image_size = histogram_image->size; - bit_array_size = 5 * histogram_image_size; - huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size, - sizeof(*huffman_codes)); - // Note: some histogram_image entries may point to tmp_histos[], so the - // latter need to outlive the following call to GetHuffBitLengthsAndCodes(). 
- if (huffman_codes == NULL || - !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - // Free combined histograms. - VP8LFreeHistogramSet(histogram_image); - histogram_image = NULL; + // Reset the bit writer for this iteration. + VP8LBitWriterReset(&bw_init, bw); - // Free scratch histograms. - VP8LFreeHistogram(tmp_histo); - tmp_histo = NULL; + // Build histogram image and symbols from backward references. + histogram_image = + VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits_tmp); + tmp_histo = VP8LAllocateHistogram(cache_bits_tmp); + if (histogram_image == NULL || tmp_histo == NULL || + !VP8LGetHistoImageSymbols(width, height, &refs_array[i_cache], + quality, low_effort, histogram_bits, + cache_bits_tmp, histogram_image, tmp_histo, + histogram_symbols)) { + goto Error; + } + // Create Huffman bit lengths and codes for each histogram image. + histogram_image_size = histogram_image->size; + bit_array_size = 5 * histogram_image_size; + huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size, + sizeof(*huffman_codes)); + // Note: some histogram_image entries may point to tmp_histos[], so the + // latter need to outlive the following call to + // GetHuffBitLengthsAndCodes(). + if (huffman_codes == NULL || + !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) { + goto Error; + } + // Free combined histograms. + VP8LFreeHistogramSet(histogram_image); + histogram_image = NULL; - // Color Cache parameters. - if (*cache_bits > 0) { - VP8LPutBits(bw, 1, 1); - VP8LPutBits(bw, *cache_bits, 4); - } else { - VP8LPutBits(bw, 0, 1); - } + // Free scratch histograms. + VP8LFreeHistogram(tmp_histo); + tmp_histo = NULL; - // Huffman image + meta huffman. - { - const int write_histogram_image = (histogram_image_size > 1); + // Color Cache parameters. + if (cache_bits_tmp > 0) { + VP8LPutBits(bw, 1, 1); + VP8LPutBits(bw, cache_bits_tmp, 4); + } else { + VP8LPutBits(bw, 0, 1); + } + + // Huffman image + meta huffman. + write_histogram_image = (histogram_image_size > 1); VP8LPutBits(bw, write_histogram_image, 1); if (write_histogram_image) { uint32_t* const histogram_argb = @@ -980,10 +997,7 @@ static WebPEncodingError EncodeImageInternal( sizeof(*histogram_argb)); int max_index = 0; uint32_t i; - if (histogram_argb == NULL) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } + if (histogram_argb == NULL) goto Error; for (i = 0; i < histogram_image_xysize; ++i) { const int symbol_index = histogram_symbols[i] & 0xffff; histogram_argb[i] = (symbol_index << 8); @@ -995,65 +1009,64 @@ static WebPEncodingError EncodeImageInternal( VP8LPutBits(bw, histogram_bits - 2, 3); err = EncodeImageNoHuffman( - bw, histogram_argb, hash_chain, refs_tmp, &refs_array[2], + bw, histogram_argb, &hash_chain_histogram, &refs_array[2], VP8LSubSampleSize(width, histogram_bits), VP8LSubSampleSize(height, histogram_bits), quality, low_effort); WebPSafeFree(histogram_argb); if (err != VP8_ENC_OK) goto Error; } - } - // Store Huffman codes. - { - int i; - int max_tokens = 0; - // Find maximum number of symbols for the huffman tree-set. - for (i = 0; i < 5 * histogram_image_size; ++i) { - HuffmanTreeCode* const codes = &huffman_codes[i]; - if (max_tokens < codes->num_symbols) { - max_tokens = codes->num_symbols; + // Store Huffman codes. + { + int i; + int max_tokens = 0; + // Find maximum number of symbols for the huffman tree-set. 
+ for (i = 0; i < 5 * histogram_image_size; ++i) { + HuffmanTreeCode* const codes = &huffman_codes[i]; + if (max_tokens < codes->num_symbols) { + max_tokens = codes->num_symbols; + } + } + tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens)); + if (tokens == NULL) goto Error; + for (i = 0; i < 5 * histogram_image_size; ++i) { + HuffmanTreeCode* const codes = &huffman_codes[i]; + StoreHuffmanCode(bw, huff_tree, tokens, codes); + ClearHuffmanTreeIfOnlyOneSymbol(codes); } } - tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens)); - if (tokens == NULL) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; + // Store actual literals. + hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position); + err = StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache], + histogram_symbols, huffman_codes); + if (err != VP8_ENC_OK) goto Error; + // Keep track of the smallest image so far. + if (VP8LBitWriterNumBytes(bw) < bw_size_best) { + bw_size_best = VP8LBitWriterNumBytes(bw); + *cache_bits = cache_bits_tmp; + *hdr_size = hdr_size_tmp; + *data_size = + (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size); + VP8LBitWriterSwap(bw, &bw_best); } - for (i = 0; i < 5 * histogram_image_size; ++i) { - HuffmanTreeCode* const codes = &huffman_codes[i]; - StoreHuffmanCode(bw, huff_tree, tokens, codes); - ClearHuffmanTreeIfOnlyOneSymbol(codes); + WebPSafeFree(tokens); + tokens = NULL; + if (huffman_codes != NULL) { + WebPSafeFree(huffman_codes->codes); + WebPSafeFree(huffman_codes); + huffman_codes = NULL; } } - // Store actual literals. - hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position); - err = StoreImageToBitMask(bw, width, histogram_bits, refs_best, - histogram_symbols, huffman_codes); - // Keep track of the smallest image so far. - if (lz77s_idx == 0 || - VP8LBitWriterNumBytes(bw) < VP8LBitWriterNumBytes(&bw_best)) { - *hdr_size = hdr_size_tmp; - *data_size = - (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size); - VP8LBitWriterSwap(bw, &bw_best); - } - // Reset the bit writer for the following iteration if any. 
- if (config->lz77s_types_to_try_size_ > 1) VP8LBitWriterReset(&bw_init, bw); - WebPSafeFree(tokens); - tokens = NULL; - if (huffman_codes != NULL) { - WebPSafeFree(huffman_codes->codes); - WebPSafeFree(huffman_codes); - huffman_codes = NULL; - } } VP8LBitWriterSwap(bw, &bw_best); + err = VP8_ENC_OK; Error: WebPSafeFree(tokens); WebPSafeFree(huff_tree); VP8LFreeHistogramSet(histogram_image); VP8LFreeHistogram(tmp_histo); + VP8LHashChainClear(&hash_chain_histogram); if (huffman_codes != NULL) { WebPSafeFree(huffman_codes->codes); WebPSafeFree(huffman_codes); @@ -1095,8 +1108,7 @@ static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc, VP8LPutBits(bw, pred_bits - 2, 3); return EncodeImageNoHuffman( bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_, - (VP8LBackwardRefs*)&enc->refs_[0], // cast const away - (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height, + (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height, quality, low_effort); } @@ -1116,8 +1128,7 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc, VP8LPutBits(bw, ccolor_transform_bits - 2, 3); return EncodeImageNoHuffman( bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_, - (VP8LBackwardRefs*)&enc->refs_[0], // cast const away - (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height, + (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height, quality, low_effort); } @@ -1464,8 +1475,8 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort, } tmp_palette[0] = palette[0]; return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, - &enc->refs_[0], &enc->refs_[1], palette_size, 1, - 20 /* quality */, low_effort); + &enc->refs_[0], palette_size, 1, /*quality=*/20, + low_effort); } // ----------------------------------------------------------------------------- @@ -1491,7 +1502,7 @@ static void VP8LEncoderDelete(VP8LEncoder* enc) { if (enc != NULL) { int i; VP8LHashChainClear(&enc->hash_chain_); - for (i = 0; i < 3; ++i) VP8LBackwardRefsClear(&enc->refs_[i]); + for (i = 0; i < 4; ++i) VP8LBackwardRefsClear(&enc->refs_[i]); ClearTransformBuffer(enc); WebPSafeFree(enc); } @@ -1541,7 +1552,7 @@ static int EncodeStreamHook(void* input, void* data2) { int data_size = 0; int use_delta_palette = 0; int idx; - size_t best_size = 0; + size_t best_size = ~(size_t)0; VP8LBitWriter bw_init = *bw, bw_best; (void)data2; @@ -1553,11 +1564,13 @@ static int EncodeStreamHook(void* input, void* data2) { for (idx = 0; idx < num_crunch_configs; ++idx) { const int entropy_idx = crunch_configs[idx].entropy_idx_; - enc->use_palette_ = (entropy_idx == kPalette); + enc->use_palette_ = + (entropy_idx == kPalette) || (entropy_idx == kPaletteAndSpatial); enc->use_subtract_green_ = (entropy_idx == kSubGreen) || (entropy_idx == kSpatialSubGreen); - enc->use_predict_ = - (entropy_idx == kSpatial) || (entropy_idx == kSpatialSubGreen); + enc->use_predict_ = (entropy_idx == kSpatial) || + (entropy_idx == kSpatialSubGreen) || + (entropy_idx == kPaletteAndSpatial); if (low_effort) { enc->use_cross_color_ = 0; } else { @@ -1640,7 +1653,7 @@ static int EncodeStreamHook(void* input, void* data2) { if (err != VP8_ENC_OK) goto Error; // If we are better than what we already have. - if (idx == 0 || VP8LBitWriterNumBytes(bw) < best_size) { + if (VP8LBitWriterNumBytes(bw) < best_size) { best_size = VP8LBitWriterNumBytes(bw); // Store the BitWriter. 
VP8LBitWriterSwap(bw, &bw_best); @@ -1816,7 +1829,7 @@ Error: } #undef CRUNCH_CONFIGS_MAX -#undef CRUNCH_CONFIGS_LZ77_MAX +#undef CRUNCH_SUBCONFIGS_MAX int VP8LEncodeImage(const WebPConfig* const config, const WebPPicture* const picture) { diff --git a/3rdparty/libwebp/src/enc/vp8li_enc.h b/3rdparty/libwebp/src/enc/vp8li_enc.h index d2d0fc509c..94210ce9f3 100644 --- a/3rdparty/libwebp/src/enc/vp8li_enc.h +++ b/3rdparty/libwebp/src/enc/vp8li_enc.h @@ -71,7 +71,7 @@ typedef struct { uint32_t palette_[MAX_PALETTE_SIZE]; // Some 'scratch' (potentially large) objects. - struct VP8LBackwardRefs refs_[3]; // Backward Refs array for temporaries. + struct VP8LBackwardRefs refs_[4]; // Backward Refs array for temporaries. VP8LHashChain hash_chain_; // HashChain data for constructing // backward references. } VP8LEncoder; diff --git a/3rdparty/libwebp/src/enc/webp_enc.c b/3rdparty/libwebp/src/enc/webp_enc.c index 9f4b10c26c..ce2db2e94b 100644 --- a/3rdparty/libwebp/src/enc/webp_enc.c +++ b/3rdparty/libwebp/src/enc/webp_enc.c @@ -400,7 +400,7 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) { } if (!config->exact) { - WebPCleanupTransparentAreaLossless(pic); + WebPReplaceTransparentPixels(pic, 0x000000); } ok = VP8LEncodeImage(config, pic); // Sets pic->error in case of problem. diff --git a/3rdparty/libwebp/src/mux/muxi.h b/3rdparty/libwebp/src/mux/muxi.h index ad3e1bdb97..2289822e8f 100644 --- a/3rdparty/libwebp/src/mux/muxi.h +++ b/3rdparty/libwebp/src/mux/muxi.h @@ -28,7 +28,7 @@ extern "C" { // Defines and constants. #define MUX_MAJ_VERSION 1 -#define MUX_MIN_VERSION 1 +#define MUX_MIN_VERSION 2 #define MUX_REV_VERSION 0 // Chunk object. diff --git a/3rdparty/libwebp/src/mux/muxread.c b/3rdparty/libwebp/src/mux/muxread.c index ae3b876bc5..0101fde15d 100644 --- a/3rdparty/libwebp/src/mux/muxread.c +++ b/3rdparty/libwebp/src/mux/muxread.c @@ -155,7 +155,6 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data, break; default: goto Fail; - break; } subchunk_size = ChunkDiskSize(&subchunk); bytes += subchunk_size; @@ -264,7 +263,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, if (!MuxImageParse(&chunk, copy_data, wpi)) goto Err; ChunkRelease(&chunk); goto PushImage; - break; default: // A non-image chunk. if (wpi->is_partial_) goto Err; // Encountered a non-image chunk before // getting all chunks of an image. 
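The relaxed stride assertion in the utils.c hunk below lets callers pass a
negative stride and fill a plane bottom-up, which is how a vertical flip is
done without an extra copy. A minimal sketch under that assumption --
FlipCopyPlane() is a hypothetical helper, not part of libwebp; only the
WebPCopyPlane() declaration matches src/utils/utils.h:

#include <stdint.h>

extern void WebPCopyPlane(const uint8_t* src, int src_stride,
                          uint8_t* dst, int dst_stride, int width, int height);

static void FlipCopyPlane(const uint8_t* src, int src_stride,
                          uint8_t* dst, int dst_stride, int width, int height) {
  // Aim at the last destination row and walk upwards: the stride is negative,
  // but its magnitude still covers one full row, which is exactly what
  // assert(abs(dst_stride) >= width) permits.
  WebPCopyPlane(src, src_stride, dst + (height - 1) * dst_stride, -dst_stride,
                width, height);
}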
diff --git a/3rdparty/libwebp/src/utils/utils.c b/3rdparty/libwebp/src/utils/utils.c
index 764f752b82..6080e19e21 100644
--- a/3rdparty/libwebp/src/utils/utils.c
+++ b/3rdparty/libwebp/src/utils/utils.c
@@ -231,7 +231,7 @@ void WebPFree(void* ptr) {
 void WebPCopyPlane(const uint8_t* src, int src_stride,
                    uint8_t* dst, int dst_stride, int width, int height) {
   assert(src != NULL && dst != NULL);
-  assert(src_stride >= width && dst_stride >= width);
+  assert(abs(src_stride) >= width && abs(dst_stride) >= width);
   while (height-- > 0) {
     memcpy(dst, src, width);
     src += src_stride;
diff --git a/3rdparty/libwebp/src/webp/decode.h b/3rdparty/libwebp/src/webp/decode.h
index 80dd0ef0cc..44fcd64a84 100644
--- a/3rdparty/libwebp/src/webp/decode.h
+++ b/3rdparty/libwebp/src/webp/decode.h
@@ -453,7 +453,7 @@ struct WebPDecoderOptions {
   int scaled_width, scaled_height;    // final resolution
   int use_threads;                    // if true, use multi-threaded decoding
   int dithering_strength;             // dithering strength (0=Off, 100=full)
-  int flip;                           // flip output vertically
+  int flip;                           // if true, flip output vertically
   int alpha_dithering_strength;       // alpha dithering strength in [0..100]
 
   uint32_t pad[5];                    // padding for later use
diff --git a/3rdparty/libwebp/src/webp/encode.h b/3rdparty/libwebp/src/webp/encode.h
index 655166e7d4..b4c599df87 100644
--- a/3rdparty/libwebp/src/webp/encode.h
+++ b/3rdparty/libwebp/src/webp/encode.h
@@ -148,7 +148,8 @@ struct WebPConfig {
   int use_delta_palette;  // reserved for future lossless feature
   int use_sharp_yuv;      // if needed, use sharp (and slow) RGB->YUV conversion
 
-  uint32_t pad[2];        // padding for later use
+  int qmin;               // minimum permissible quality factor
+  int qmax;               // maximum permissible quality factor
 };
 
 // Enumerate some predefined settings for WebPConfig, depending on the type
@@ -291,6 +292,11 @@ typedef enum WebPEncodingError {
 #define WEBP_MAX_DIMENSION 16383
 
 // Main exchange structure (input samples, output bytes, statistics)
+//
+// Once WebPPictureInit() has been called, it's ok to make all the INPUT fields
+// (use_argb, y/u/v, argb, ...) point to user-owned data, even if
+// WebPPictureAlloc() has been called. Depending on the value use_argb,
+// it's guaranteed that either *argb or *y/*u/*v content will be kept untouched.
 struct WebPPicture {
   // INPUT
   //////////////
diff --git a/3rdparty/openjpeg/CHANGELOG.md b/3rdparty/openjpeg/CHANGELOG.md
index e45b324658..4187b06730 100644
--- a/3rdparty/openjpeg/CHANGELOG.md
+++ b/3rdparty/openjpeg/CHANGELOG.md
@@ -1,5 +1,92 @@
 # Changelog
 
+## [v2.4.0](https://github.com/uclouvain/openjpeg/releases/v2.4.0) (2020-12-28)
+
+[Full Changelog](https://github.com/uclouvain/openjpeg/compare/v2.3.1...v2.4.0)
+
+**Closed issues:**
+
+- OPENJPEG\_INSTALL\_DOC\_DIR does not control a destination directory where HTML docs would be installed. [\#1309](https://github.com/uclouvain/openjpeg/issues/1309)
+- Heap-buffer-overflow in lib/openjp2/pi.c:312 [\#1302](https://github.com/uclouvain/openjpeg/issues/1302)
+- Heap-buffer-overflow in lib/openjp2/t2.c:973 [\#1299](https://github.com/uclouvain/openjpeg/issues/1299)
+- Heap-buffer-overflow in lib/openjp2/pi.c:623 [\#1293](https://github.com/uclouvain/openjpeg/issues/1293)
+- Global-buffer-overflow in lib/openjp2/dwt.c:1980 [\#1286](https://github.com/uclouvain/openjpeg/issues/1286)
+- Heap-buffer-overflow in lib/openjp2/tcd.c:2417 [\#1284](https://github.com/uclouvain/openjpeg/issues/1284)
+- Heap-buffer-overflow in lib/openjp2/mqc.c:499 [\#1283](https://github.com/uclouvain/openjpeg/issues/1283)
+- Openjpeg could not encode 32bit RGB float image [\#1281](https://github.com/uclouvain/openjpeg/issues/1281)
+- Openjpeg could not encode 32bit RGB float image [\#1280](https://github.com/uclouvain/openjpeg/issues/1280)
+- ISO/IEC 15444-1:2019 \(E\) compared with 'cio.h' [\#1277](https://github.com/uclouvain/openjpeg/issues/1277)
+- Test-suite failure due to hash mismatch [\#1264](https://github.com/uclouvain/openjpeg/issues/1264)
+- Heap use-after-free [\#1261](https://github.com/uclouvain/openjpeg/issues/1261)
+- Memory leak when failing to allocate object... [\#1259](https://github.com/uclouvain/openjpeg/issues/1259)
+- Memory leak of Tier 1 handle when OpenJPEG fails to set it as TLS... [\#1257](https://github.com/uclouvain/openjpeg/issues/1257)
+- Any plan to build release for CVE-2020-8112/CVE-2020-6851 [\#1247](https://github.com/uclouvain/openjpeg/issues/1247)
+- failing to convert 16-bit file: opj\_t2\_encode\_packet\(\): only 5251 bytes remaining in output buffer. 5621 needed. [\#1243](https://github.com/uclouvain/openjpeg/issues/1243)
+- CMake+VS2017 Compile OK, thirdparty Compile OK, but thirdparty not install [\#1239](https://github.com/uclouvain/openjpeg/issues/1239)
+- New release to solve CVE-2019-6988 ? [\#1238](https://github.com/uclouvain/openjpeg/issues/1238)
+- Many tests fail to pass after the update of libtiff to version 4.1.0 [\#1233](https://github.com/uclouvain/openjpeg/issues/1233)
+- Another heap buffer overflow in libopenjp2 [\#1231](https://github.com/uclouvain/openjpeg/issues/1231)
+- Heap buffer overflow in libopenjp2 [\#1228](https://github.com/uclouvain/openjpeg/issues/1228)
+- Endianness of binary volume \(JP3D\) [\#1224](https://github.com/uclouvain/openjpeg/issues/1224)
+- New release to resolve CVE-2019-12973 [\#1222](https://github.com/uclouvain/openjpeg/issues/1222)
+- how to set the block size,like 128,256 ? [\#1216](https://github.com/uclouvain/openjpeg/issues/1216)
+- compress YUV files to motion jpeg2000 standard [\#1213](https://github.com/uclouvain/openjpeg/issues/1213)
+- Repair/update Java wrapper, and include in release [\#1208](https://github.com/uclouvain/openjpeg/issues/1208)
+- abc [\#1206](https://github.com/uclouvain/openjpeg/issues/1206)
+- Slow decoding [\#1202](https://github.com/uclouvain/openjpeg/issues/1202)
+- Installation question [\#1201](https://github.com/uclouvain/openjpeg/issues/1201)
+- Typo in test\_decode\_area - \*ptilew is assigned instead of \*ptileh [\#1195](https://github.com/uclouvain/openjpeg/issues/1195)
+- Creating a J2K file with one POC is broken [\#1191](https://github.com/uclouvain/openjpeg/issues/1191)
+- Make fails on Arch Linux [\#1174](https://github.com/uclouvain/openjpeg/issues/1174)
+- Heap buffer overflow in opj\_t1\_clbl\_decode\_processor\(\) triggered with Ghostscript [\#1158](https://github.com/uclouvain/openjpeg/issues/1158)
+- opj\_stream\_get\_number\_byte\_left: Assertion `p\_stream-\>m\_byte\_offset \>= 0' failed. [\#1151](https://github.com/uclouvain/openjpeg/issues/1151)
+- The fuzzer ignores too many inputs [\#1079](https://github.com/uclouvain/openjpeg/issues/1079)
+- out of bounds read [\#1068](https://github.com/uclouvain/openjpeg/issues/1068)
+
+**Merged pull requests:**
+
+- Change defined WIN32 [\#1310](https://github.com/uclouvain/openjpeg/pull/1310) ([Jamaika1](https://github.com/Jamaika1))
+- docs: fix simple typo, producted -\> produced [\#1308](https://github.com/uclouvain/openjpeg/pull/1308) ([timgates42](https://github.com/timgates42))
+- Set ${OPENJPEG\_INSTALL\_DOC\_DIR} to DESTINATION of HTMLs [\#1307](https://github.com/uclouvain/openjpeg/pull/1307) ([lemniscati](https://github.com/lemniscati))
+- Use INC\_DIR for OPENJPEG\_INCLUDE\_DIRS \(fixes uclouvain\#1174\) [\#1306](https://github.com/uclouvain/openjpeg/pull/1306) ([matthew-sharp](https://github.com/matthew-sharp))
+- pi.c: avoid out of bounds access with POC \(fixes \#1302\) [\#1304](https://github.com/uclouvain/openjpeg/pull/1304) ([rouault](https://github.com/rouault))
+- Encoder: grow again buffer size [\#1303](https://github.com/uclouvain/openjpeg/pull/1303) ([zodf0055980](https://github.com/zodf0055980))
+- opj\_j2k\_write\_sod\(\): avoid potential heap buffer overflow \(fixes \#1299\) \(probably master only\) [\#1301](https://github.com/uclouvain/openjpeg/pull/1301) ([rouault](https://github.com/rouault))
+- pi.c: avoid out of bounds access with POC \(refs https://github.com/uclouvain/openjpeg/issues/1293\#issuecomment-737122836\) [\#1300](https://github.com/uclouvain/openjpeg/pull/1300) ([rouault](https://github.com/rouault))
+- opj\_t2\_encode\_packet\(\): avoid out of bound access of \#1297, but likely not the proper fix [\#1298](https://github.com/uclouvain/openjpeg/pull/1298) ([rouault](https://github.com/rouault))
+- opj\_t2\_encode\_packet\(\): avoid out of bound access of \#1294, but likely not the proper fix [\#1296](https://github.com/uclouvain/openjpeg/pull/1296) ([rouault](https://github.com/rouault))
+- opj\_j2k\_setup\_encoder\(\): validate POC compno0 and compno1 \(fixes \#1293\) [\#1295](https://github.com/uclouvain/openjpeg/pull/1295) ([rouault](https://github.com/rouault))
+- Encoder: avoid global buffer overflow on irreversible conversion when… [\#1292](https://github.com/uclouvain/openjpeg/pull/1292) ([rouault](https://github.com/rouault))
+- Decoding: deal with some SPOT6 images that have tiles with a single tile-part with TPsot == 0 and TNsot == 0, and with missing EOC [\#1291](https://github.com/uclouvain/openjpeg/pull/1291) ([rouault](https://github.com/rouault))
+- Free p\_tcd\_marker\_info to avoid memory leak [\#1288](https://github.com/uclouvain/openjpeg/pull/1288) ([zodf0055980](https://github.com/zodf0055980))
+- Encoder: grow again buffer size [\#1287](https://github.com/uclouvain/openjpeg/pull/1287) ([zodf0055980](https://github.com/zodf0055980))
+- Encoder: avoid uint32 overflow when allocating memory for codestream buffer \(fixes \#1243\) [\#1276](https://github.com/uclouvain/openjpeg/pull/1276) ([rouault](https://github.com/rouault))
+- Java compatibility from 1.5 to 1.6 [\#1263](https://github.com/uclouvain/openjpeg/pull/1263) ([jiapei100](https://github.com/jiapei100))
+- opj\_decompress: fix double-free on input directory with mix of valid and invalid images [\#1262](https://github.com/uclouvain/openjpeg/pull/1262) ([rouault](https://github.com/rouault))
+- openjp2: Plug image leak when failing to allocate codestream index. [\#1260](https://github.com/uclouvain/openjpeg/pull/1260) ([sebras](https://github.com/sebras))
+- openjp2: Plug memory leak when setting data as TLS fails. [\#1258](https://github.com/uclouvain/openjpeg/pull/1258) ([sebras](https://github.com/sebras))
+- openjp2: Error out if failing to create Tier 1 handle. [\#1256](https://github.com/uclouvain/openjpeg/pull/1256) ([sebras](https://github.com/sebras))
+- Testing for invalid values of width, height, numcomps [\#1254](https://github.com/uclouvain/openjpeg/pull/1254) ([szukw000](https://github.com/szukw000))
+- Single-threaded performance improvements in forward DWT for 5-3 and 9-7 \(and other improvements\) [\#1253](https://github.com/uclouvain/openjpeg/pull/1253) ([rouault](https://github.com/rouault))
+- Add support for multithreading in encoder [\#1248](https://github.com/uclouvain/openjpeg/pull/1248) ([rouault](https://github.com/rouault))
+- Add support for generation of PLT markers in encoder [\#1246](https://github.com/uclouvain/openjpeg/pull/1246) ([rouault](https://github.com/rouault))
+- Fix warnings about signed/unsigned casts in pi.c [\#1244](https://github.com/uclouvain/openjpeg/pull/1244) ([rouault](https://github.com/rouault))
+- opj\_decompress: add sanity checks to avoid segfault in case of decoding error [\#1240](https://github.com/uclouvain/openjpeg/pull/1240) ([rouault](https://github.com/rouault))
+- ignore wrong icc [\#1236](https://github.com/uclouvain/openjpeg/pull/1236) ([szukw000](https://github.com/szukw000))
+- Implement writing of IMF profiles [\#1235](https://github.com/uclouvain/openjpeg/pull/1235) ([rouault](https://github.com/rouault))
+- tests: add alternate checksums for libtiff 4.1 [\#1234](https://github.com/uclouvain/openjpeg/pull/1234) ([rouault](https://github.com/rouault))
+- opj\_tcd\_init\_tile\(\): avoid integer overflow [\#1232](https://github.com/uclouvain/openjpeg/pull/1232) ([rouault](https://github.com/rouault))
+- tests/fuzzers: link fuzz binaries using $LIB\_FUZZING\_ENGINE. [\#1230](https://github.com/uclouvain/openjpeg/pull/1230) ([Dor1s](https://github.com/Dor1s))
+- opj\_j2k\_update\_image\_dimensions\(\): reject images whose coordinates are beyond INT\_MAX \(fixes \#1228\) [\#1229](https://github.com/uclouvain/openjpeg/pull/1229) ([rouault](https://github.com/rouault))
+- Fix resource leaks [\#1226](https://github.com/uclouvain/openjpeg/pull/1226) ([dodys](https://github.com/dodys))
+- abi-check.sh: fix false postive ABI error, and display output error log [\#1218](https://github.com/uclouvain/openjpeg/pull/1218) ([rouault](https://github.com/rouault))
+- pi.c: avoid integer overflow, resulting in later invalid access to memory in opj\_t2\_decode\_packets\(\) [\#1217](https://github.com/uclouvain/openjpeg/pull/1217) ([rouault](https://github.com/rouault))
+- Add check to validate SGcod/SPcoc/SPcod parameter values. [\#1211](https://github.com/uclouvain/openjpeg/pull/1211) ([sebras](https://github.com/sebras))
+- Fix buffer overflow reading an image file less than four characters [\#1196](https://github.com/uclouvain/openjpeg/pull/1196) ([robert-ancell](https://github.com/robert-ancell))
+- compression: emit POC marker when only one single POC is requested \(f… [\#1192](https://github.com/uclouvain/openjpeg/pull/1192) ([rouault](https://github.com/rouault))
+- Fix several potential vulnerabilities [\#1185](https://github.com/uclouvain/openjpeg/pull/1185) ([Young-X](https://github.com/Young-X))
+- openjp2/j2k: Report error if all wanted components are not decoded. [\#1164](https://github.com/uclouvain/openjpeg/pull/1164) ([sebras](https://github.com/sebras))
+
 ## [v2.3.1](https://github.com/uclouvain/openjpeg/releases/v2.3.1) (2019-04-02)
 
 [Full Changelog](https://github.com/uclouvain/openjpeg/compare/v2.3.0...v2.3.1)
diff --git a/3rdparty/openjpeg/CMakeLists.txt b/3rdparty/openjpeg/CMakeLists.txt
index b38bf28f05..fe766101d0 100644
--- a/3rdparty/openjpeg/CMakeLists.txt
+++ b/3rdparty/openjpeg/CMakeLists.txt
@@ -18,8 +18,8 @@ ocv_warnings_disable(CMAKE_C_FLAGS
 #-----------------------------------------------------------------------------
 # OPENJPEG version number, useful for packaging and doxygen doc:
 set(OPENJPEG_VERSION_MAJOR 2)
-set(OPENJPEG_VERSION_MINOR 3)
-set(OPENJPEG_VERSION_BUILD 1)
+set(OPENJPEG_VERSION_MINOR 4)
+set(OPENJPEG_VERSION_BUILD 0)
 set(OPENJPEG_VERSION
   "${OPENJPEG_VERSION_MAJOR}.${OPENJPEG_VERSION_MINOR}.${OPENJPEG_VERSION_BUILD}")
 set(PACKAGE_VERSION
@@ -43,6 +43,7 @@ set(PACKAGE_VERSION
 #     2.2.0   |   7
 #     2.3.0   |   7
 #     2.3.1   |   7
+#     2.4.0   |   7
 # above is the recommendation by the OPJ team. If you really need to override this default,
 # you can specify your own OPENJPEG_SOVERSION at cmake configuration time:
 # cmake -DOPENJPEG_SOVERSION:STRING=42 /path/to/openjpeg
diff --git a/3rdparty/openjpeg/openjp2/CMakeLists.txt b/3rdparty/openjpeg/openjp2/CMakeLists.txt
index 7decabe210..321d318642 100644
--- a/3rdparty/openjpeg/openjp2/CMakeLists.txt
+++ b/3rdparty/openjpeg/openjp2/CMakeLists.txt
@@ -33,7 +33,11 @@ endif()
 #     set(WIN32 YES)
 #   endif()
 
-ocv_warnings_disable(CMAKE_C_FLAGS -Wundef -Wstrict-prototypes -Wcast-function-type)
+ocv_warnings_disable(CMAKE_C_FLAGS
+    -Wundef -Wstrict-prototypes -Wcast-function-type
+    -Wshadow  # v2.4.0: GCC
+    -Wunused-function  # v2.4.0: Clang
+)
 
 add_library(${OPENJPEG_LIBRARY_NAME} STATIC ${OPENJPEG_SRCS})
diff --git a/3rdparty/openjpeg/openjp2/dwt.c b/3rdparty/openjpeg/openjp2/dwt.c
index 5930d1c71e..4164ba090e 100644
--- a/3rdparty/openjpeg/openjp2/dwt.c
+++ b/3rdparty/openjpeg/openjp2/dwt.c
@@ -87,12 +87,14 @@ typedef struct dwt_local {
     OPJ_INT32 cas;  /* 0 = start on even coord, 1 = start on odd coord */
 } opj_dwt_t;
 
-typedef union {
-    OPJ_FLOAT32 f[4];
-} opj_v4_t;
+#define NB_ELTS_V8 8
 
-typedef struct v4dwt_local {
-    opj_v4_t* wavelet ;
+typedef union {
+    OPJ_FLOAT32 f[NB_ELTS_V8];
+} opj_v8_t;
+
+typedef struct v8dwt_local {
+    opj_v8_t* wavelet ;
     OPJ_INT32 dn ;  /* number of elements in high pass band */
     OPJ_INT32 sn ;  /* number of elements in low pass band */
     OPJ_INT32 cas ;  /* 0 = start on even coord, 1 = start on odd coord */
@@ -100,45 +102,34 @@ typedef struct v4dwt_local {
     OPJ_UINT32 win_l_x1; /* end coord in low pass band */
     OPJ_UINT32 win_h_x0; /* start coord in high pass band */
     OPJ_UINT32 win_h_x1; /* end coord in high pass band */
-} opj_v4dwt_t ;
+} opj_v8dwt_t ;
 
-static const OPJ_FLOAT32 opj_dwt_alpha = 1.586134342f; /* 12994 */
-static const OPJ_FLOAT32 opj_dwt_beta = 0.052980118f; /* 434 */
-static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /* -7233 */
-static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /* -3633 */
+/* From table F.4 from the standard */
+static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f;
+static const OPJ_FLOAT32 opj_dwt_beta = -0.052980118f;
+static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f;
+static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f;
 
-static const OPJ_FLOAT32 opj_K = 1.230174105f; /* 10078 */
-static const OPJ_FLOAT32 opj_c13318 = 1.625732422f;
+static const OPJ_FLOAT32 opj_K = 1.230174105f;
+static const OPJ_FLOAT32 opj_invK = (OPJ_FLOAT32)(1.0 / 1.230174105);
 
 /*@}*/
 
-/**
-Virtual function type for wavelet transform in 1-D
-*/
-typedef void (*DWT1DFN)(const opj_dwt_t* v);
-
 /** @name Local static functions */
 /*@{*/
 
 /**
 Forward lazy transform (horizontal)
 */
-static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
+static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
+                                   OPJ_INT32 * OPJ_RESTRICT b,
+                                   OPJ_INT32 dn,
                                    OPJ_INT32 sn, OPJ_INT32 cas);
-/**
-Forward lazy transform (vertical)
-*/
-static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
-                                   OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
-/**
-Forward 5-3 wavelet transform in 1-D
-*/
-static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
-                             OPJ_INT32 cas);
+
 /**
 Forward 9-7 wavelet transform in 1-D
 */
-static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
+static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
                                   OPJ_INT32 cas);
 /**
 Explicit calculation of the Quantization Stepsizes
@@ -155,8 +146,29 @@ static OPJ_BOOL
opj_dwt_decode_partial_tile( opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres); -static OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)); +/* Forward transform, for the vertical pass, processing cols columns */ +/* where cols <= NB_ELTS_V8 */ +/* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */ +typedef void (*opj_encode_and_deinterleave_v_fnptr_type)( + void *array, + void *tmp, + OPJ_UINT32 height, + OPJ_BOOL even, + OPJ_UINT32 stride_width, + OPJ_UINT32 cols); + +/* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */ +typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)( + void *row, + void *tmp, + OPJ_UINT32 width, + OPJ_BOOL even); + +static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, + opj_tcd_tilecomp_t * tilec, + opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v, + opj_encode_and_deinterleave_h_one_row_fnptr_type + p_encode_and_deinterleave_h_one_row); static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, OPJ_UINT32 i); @@ -164,42 +176,6 @@ static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, /* */ /* Inverse 9-7 wavelet transform in 1-D. */ /* */ -static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt); - -static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 remaining_height); - -static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 nb_elts_read); - -#ifdef __SSE__ -static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const __m128 c); - -static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, __m128 c); - -#else -static void opj_v4dwt_decode_step1(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const OPJ_FLOAT32 c); - -static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, - OPJ_FLOAT32 c); - -#endif /*@}*/ @@ -246,12 +222,14 @@ static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = { /* */ /* Forward lazy transform (horizontal). */ /* */ -static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, +static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a, + OPJ_INT32 * OPJ_RESTRICT b, + OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { OPJ_INT32 i; - OPJ_INT32 * l_dest = b; - OPJ_INT32 * l_src = a + cas; + OPJ_INT32 * OPJ_RESTRICT l_dest = b; + const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas; for (i = 0; i < sn; ++i) { *l_dest++ = *l_src; @@ -267,40 +245,13 @@ static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, } } -/* */ -/* Forward lazy transform (vertical). */ -/* */ -static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, - OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas) -{ - OPJ_INT32 i = sn; - OPJ_INT32 * l_dest = b; - OPJ_INT32 * l_src = a + cas; - - while (i--) { - *l_dest = *l_src; - l_dest += x; - l_src += 2; - } /* b[i*x]=a[2*i+cas]; */ - - l_dest = b + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)x; - l_src = a + 1 - cas; - - i = dn; - while (i--) { - *l_dest = *l_src; - l_dest += x; - l_src += 2; - } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/ -} - #ifdef STANDARD_SLOW_VERSION /* */ /* Inverse lazy transform (horizontal). 
*/ /* */ static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a) { - OPJ_INT32 *ai = a; + const OPJ_INT32 *ai = a; OPJ_INT32 *bi = h->mem + h->cas; OPJ_INT32 i = h->sn; while (i--) { @@ -321,7 +272,7 @@ static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a) /* */ static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) { - OPJ_INT32 *ai = a; + const OPJ_INT32 *ai = a; OPJ_INT32 *bi = v->mem + v->cas; OPJ_INT32 i = v->sn; while (i--) { @@ -341,37 +292,6 @@ static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) #endif /* STANDARD_SLOW_VERSION */ -/* */ -/* Forward 5-3 wavelet transform in 1-D. */ -/* */ -static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas) -{ - OPJ_INT32 i; - - if (!cas) { - if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < dn; i++) { - OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; - } - for (i = 0; i < sn; i++) { - OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; - } - } - } else { - if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ - OPJ_S(0) *= 2; - } else { - for (i = 0; i < dn; i++) { - OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; - } - for (i = 0; i < sn; i++) { - OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; - } - } - } -} - #ifdef STANDARD_SLOW_VERSION /* */ /* Inverse 5-3 wavelet transform in 1-D. */ @@ -1033,57 +953,137 @@ static void opj_idwt53_v(const opj_dwt_t *dwt, #endif } - -/* */ -/* Forward 9-7 wavelet transform in 1-D. */ -/* */ -static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas) +#if 0 +static void opj_dwt_encode_step1(OPJ_FLOAT32* fw, + OPJ_UINT32 end, + const OPJ_FLOAT32 c) { - OPJ_INT32 i; - if (!cas) { - if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < dn; i++) { - OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993); - } - for (i = 0; i < sn; i++) { - OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434); - } - for (i = 0; i < dn; i++) { - OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233); - } - for (i = 0; i < sn; i++) { - OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633); - } - for (i = 0; i < dn; i++) { - OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */ - } - for (i = 0; i < sn; i++) { - OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */ - } + OPJ_UINT32 i = 0; + for (; i < end; ++i) { + fw[0] *= c; + fw += 2; + } +} +#else +static void opj_dwt_encode_step1_combined(OPJ_FLOAT32* fw, + OPJ_UINT32 iters_c1, + OPJ_UINT32 iters_c2, + const OPJ_FLOAT32 c1, + const OPJ_FLOAT32 c2) +{ + OPJ_UINT32 i = 0; + const OPJ_UINT32 iters_common = opj_uint_min(iters_c1, iters_c2); + assert((((OPJ_SIZE_T)fw) & 0xf) == 0); + assert(opj_int_abs((OPJ_INT32)iters_c1 - (OPJ_INT32)iters_c2) <= 1); + for (; i + 3 < iters_common; i += 4) { +#ifdef __SSE__ + const __m128 vcst = _mm_set_ps(c2, c1, c2, c1); + *(__m128*)fw = _mm_mul_ps(*(__m128*)fw, vcst); + *(__m128*)(fw + 4) = _mm_mul_ps(*(__m128*)(fw + 4), vcst); +#else + fw[0] *= c1; + fw[1] *= c2; + fw[2] *= c1; + fw[3] *= c2; + fw[4] *= c1; + fw[5] *= c2; + fw[6] *= c1; + fw[7] *= c2; +#endif + fw += 8; + } + for (; i < iters_common; i++) { + fw[0] *= c1; + fw[1] *= c2; + fw += 2; + } + if (i < iters_c1) { + fw[0] *= c1; + } else if (i < iters_c2) { + fw[1] *= c2; + } +} + +#endif + +static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw, + OPJ_UINT32 end, + OPJ_UINT32 m, + OPJ_FLOAT32 c) +{ + OPJ_UINT32 i; + OPJ_UINT32 imax = opj_uint_min(end, m); + if (imax 
> 0) { + fw[-1] += (fl[0] + fw[0]) * c; + fw += 2; + i = 1; + for (; i + 3 < imax; i += 4) { + fw[-1] += (fw[-2] + fw[0]) * c; + fw[1] += (fw[0] + fw[2]) * c; + fw[3] += (fw[2] + fw[4]) * c; + fw[5] += (fw[4] + fw[6]) * c; + fw += 8; } - } else { - if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < dn; i++) { - OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993); - } - for (i = 0; i < sn; i++) { - OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434); - } - for (i = 0; i < dn; i++) { - OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233); - } - for (i = 0; i < sn; i++) { - OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633); - } - for (i = 0; i < dn; i++) { - OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */ - } - for (i = 0; i < sn; i++) { - OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */ - } + for (; i < imax; ++i) { + fw[-1] += (fw[-2] + fw[0]) * c; + fw += 2; } } + if (m < end) { + assert(m + 1 == end); + fw[-1] += (2 * fw[-2]) * c; + } +} + +static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, + OPJ_INT32 cas) +{ + OPJ_FLOAT32* w = (OPJ_FLOAT32*)aIn; + OPJ_INT32 a, b; + assert(dn + sn > 1); + if (cas == 0) { + a = 0; + b = 1; + } else { + a = 1; + b = 0; + } + opj_dwt_encode_step2(w + a, w + b + 1, + (OPJ_UINT32)dn, + (OPJ_UINT32)opj_int_min(dn, sn - b), + opj_dwt_alpha); + opj_dwt_encode_step2(w + b, w + a + 1, + (OPJ_UINT32)sn, + (OPJ_UINT32)opj_int_min(sn, dn - a), + opj_dwt_beta); + opj_dwt_encode_step2(w + a, w + b + 1, + (OPJ_UINT32)dn, + (OPJ_UINT32)opj_int_min(dn, sn - b), + opj_dwt_gamma); + opj_dwt_encode_step2(w + b, w + a + 1, + (OPJ_UINT32)sn, + (OPJ_UINT32)opj_int_min(sn, dn - a), + opj_dwt_delta); +#if 0 + opj_dwt_encode_step1(w + b, (OPJ_UINT32)dn, + opj_K); + opj_dwt_encode_step1(w + a, (OPJ_UINT32)sn, + opj_invK); +#else + if (a == 0) { + opj_dwt_encode_step1_combined(w, + (OPJ_UINT32)sn, + (OPJ_UINT32)dn, + opj_invK, + opj_K); + } else { + opj_dwt_encode_step1_combined(w, + (OPJ_UINT32)dn, + (OPJ_UINT32)sn, + opj_K, + opj_invK); + } +#endif } static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, @@ -1102,41 +1102,650 @@ static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, ========================================================== */ +/** Process one line for the horizontal pass of the 5x3 forward transform */ +static +void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn, + void* tmpIn, + OPJ_UINT32 width, + OPJ_BOOL even) +{ + OPJ_INT32* OPJ_RESTRICT row = (OPJ_INT32*)rowIn; + OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32*)tmpIn; + const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 
1 : 0)) >> 1); + const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn); + + if (even) { + if (width > 1) { + OPJ_INT32 i; + for (i = 0; i < sn - 1; i++) { + tmp[sn + i] = row[2 * i + 1] - ((row[(i) * 2] + row[(i + 1) * 2]) >> 1); + } + if ((width % 2) == 0) { + tmp[sn + i] = row[2 * i + 1] - row[(i) * 2]; + } + row[0] += (tmp[sn] + tmp[sn] + 2) >> 2; + for (i = 1; i < dn; i++) { + row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + i] + 2) >> 2); + } + if ((width % 2) == 1) { + row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + (i - 1)] + 2) >> 2); + } + memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32)); + } + } else { + if (width == 1) { + row[0] *= 2; + } else { + OPJ_INT32 i; + tmp[sn + 0] = row[0] - row[1]; + for (i = 1; i < sn; i++) { + tmp[sn + i] = row[2 * i] - ((row[2 * i + 1] + row[2 * (i - 1) + 1]) >> 1); + } + if ((width % 2) == 1) { + tmp[sn + i] = row[2 * i] - row[2 * (i - 1) + 1]; + } + + for (i = 0; i < dn - 1; i++) { + row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i + 1] + 2) >> 2); + } + if ((width % 2) == 0) { + row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i] + 2) >> 2); + } + memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32)); + } + } +} + +/** Process one line for the horizontal pass of the 9x7 forward transform */ +static +void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn, + void* tmpIn, + OPJ_UINT32 width, + OPJ_BOOL even) +{ + OPJ_FLOAT32* OPJ_RESTRICT row = (OPJ_FLOAT32*)rowIn; + OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn; + const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1); + const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn); + if (width == 1) { + return; + } + memcpy(tmp, row, width * sizeof(OPJ_FLOAT32)); + opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1); + opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp, + (OPJ_INT32 * OPJ_RESTRICT)row, + dn, sn, even ? 0 : 1); +} + +typedef struct { + opj_dwt_t h; + OPJ_UINT32 rw; /* Width of the resolution to process */ + OPJ_UINT32 w; /* Width of tiledp */ + OPJ_INT32 * OPJ_RESTRICT tiledp; + OPJ_UINT32 min_j; + OPJ_UINT32 max_j; + opj_encode_and_deinterleave_h_one_row_fnptr_type p_function; +} opj_dwt_encode_h_job_t; + +static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls) +{ + OPJ_UINT32 j; + opj_dwt_encode_h_job_t* job; + (void)tls; + + job = (opj_dwt_encode_h_job_t*)user_data; + for (j = job->min_j; j < job->max_j; j++) { + OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w; + (*job->p_function)(aj, job->h.mem, job->rw, + job->h.cas == 0 ? 
OPJ_TRUE : OPJ_FALSE); + } + + opj_aligned_free(job->h.mem); + opj_free(job); +} + +typedef struct { + opj_dwt_t v; + OPJ_UINT32 rh; + OPJ_UINT32 w; + OPJ_INT32 * OPJ_RESTRICT tiledp; + OPJ_UINT32 min_j; + OPJ_UINT32 max_j; + opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v; +} opj_dwt_encode_v_job_t; + +static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) +{ + OPJ_UINT32 j; + opj_dwt_encode_v_job_t* job; + (void)tls; + + job = (opj_dwt_encode_v_job_t*)user_data; + for (j = job->min_j; j + NB_ELTS_V8 - 1 < job->max_j; j += NB_ELTS_V8) { + (*job->p_encode_and_deinterleave_v)(job->tiledp + j, + job->v.mem, + job->rh, + job->v.cas == 0, + job->w, + NB_ELTS_V8); + } + if (j < job->max_j) { + (*job->p_encode_and_deinterleave_v)(job->tiledp + j, + job->v.mem, + job->rh, + job->v.cas == 0, + job->w, + job->max_j - j); + } + + opj_aligned_free(job->v.mem); + opj_free(job); +} + +/** Fetch up to cols <= NB_ELTS_V8 for each line, and put them in tmpOut */ +/* that has a NB_ELTS_V8 interleave factor. */ +static void opj_dwt_fetch_cols_vertical_pass(const void *arrayIn, + void *tmpOut, + OPJ_UINT32 height, + OPJ_UINT32 stride_width, + OPJ_UINT32 cols) +{ + const OPJ_INT32* OPJ_RESTRICT array = (const OPJ_INT32 * OPJ_RESTRICT)arrayIn; + OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpOut; + if (cols == NB_ELTS_V8) { + OPJ_UINT32 k; + for (k = 0; k < height; ++k) { + memcpy(tmp + NB_ELTS_V8 * k, + array + k * stride_width, + NB_ELTS_V8 * sizeof(OPJ_INT32)); + } + } else { + OPJ_UINT32 k; + for (k = 0; k < height; ++k) { + OPJ_UINT32 c; + for (c = 0; c < cols; c++) { + tmp[NB_ELTS_V8 * k + c] = array[c + k * stride_width]; + } + for (; c < NB_ELTS_V8; c++) { + tmp[NB_ELTS_V8 * k + c] = 0; + } + } + } +} + +/* Deinterleave result of forward transform, where cols <= NB_ELTS_V8 */ +/* and src contains NB_ELTS_V8 consecutive values for up to NB_ELTS_V8 */ +/* columns. */ +static INLINE void opj_dwt_deinterleave_v_cols( + const OPJ_INT32 * OPJ_RESTRICT src, + OPJ_INT32 * OPJ_RESTRICT dst, + OPJ_INT32 dn, + OPJ_INT32 sn, + OPJ_UINT32 stride_width, + OPJ_INT32 cas, + OPJ_UINT32 cols) +{ + OPJ_INT32 k; + OPJ_INT32 i = sn; + OPJ_INT32 * OPJ_RESTRICT l_dest = dst; + const OPJ_INT32 * OPJ_RESTRICT l_src = src + cas * NB_ELTS_V8; + OPJ_UINT32 c; + + for (k = 0; k < 2; k++) { + while (i--) { + if (cols == NB_ELTS_V8) { + memcpy(l_dest, l_src, NB_ELTS_V8 * sizeof(OPJ_INT32)); + } else { + c = 0; + switch (cols) { + case 7: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 6: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 5: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 4: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 3: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 2: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + default: + l_dest[c] = l_src[c]; + break; + } + } + l_dest += stride_width; + l_src += 2 * NB_ELTS_V8; + } + + l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width; + l_src = src + (1 - cas) * NB_ELTS_V8; + i = dn; + } +} + + +/* Forward 5-3 transform, for the vertical pass, processing cols columns */ +/* where cols <= NB_ELTS_V8 */ +static void opj_dwt_encode_and_deinterleave_v( + void *arrayIn, + void *tmpIn, + OPJ_UINT32 height, + OPJ_BOOL even, + OPJ_UINT32 stride_width, + OPJ_UINT32 cols) +{ + OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn; + OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn; + const OPJ_UINT32 sn = (height + (even ? 
1 : 0)) >> 1; + const OPJ_UINT32 dn = height - sn; + + opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols); + +#define OPJ_Sc(i) tmp[(i)*2* NB_ELTS_V8 + c] +#define OPJ_Dc(i) tmp[((1+(i)*2))* NB_ELTS_V8 + c] + +#ifdef __SSE2__ + if (height == 1) { + if (!even) { + OPJ_UINT32 c; + for (c = 0; c < NB_ELTS_V8; c++) { + tmp[c] *= 2; + } + } + } else if (even) { + OPJ_UINT32 c; + OPJ_UINT32 i; + i = 0; + if (i + 1 < sn) { + __m128i xmm_Si_0 = *(const __m128i*)(tmp + 4 * 0); + __m128i xmm_Si_1 = *(const __m128i*)(tmp + 4 * 1); + for (; i + 1 < sn; i++) { + __m128i xmm_Sip1_0 = *(const __m128i*)(tmp + + (i + 1) * 2 * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Sip1_1 = *(const __m128i*)(tmp + + (i + 1) * 2 * NB_ELTS_V8 + 4 * 1); + __m128i xmm_Di_0 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Di_1 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 1); + xmm_Di_0 = _mm_sub_epi32(xmm_Di_0, + _mm_srai_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), 1)); + xmm_Di_1 = _mm_sub_epi32(xmm_Di_1, + _mm_srai_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), 1)); + *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0; + *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1; + xmm_Si_0 = xmm_Sip1_0; + xmm_Si_1 = xmm_Sip1_1; + } + } + if (((height) % 2) == 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) -= OPJ_Sc(i); + } + } + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2; + } + i = 1; + if (i < dn) { + __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 + + (i - 1) * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 + + (i - 1) * 2) * NB_ELTS_V8 + 4 * 1); + const __m128i xmm_two = _mm_set1_epi32(2); + for (; i < dn; i++) { + __m128i xmm_Di_0 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Di_1 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 1); + __m128i xmm_Si_0 = *(const __m128i*)(tmp + + (i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Si_1 = *(const __m128i*)(tmp + + (i * 2) * NB_ELTS_V8 + 4 * 1); + xmm_Si_0 = _mm_add_epi32(xmm_Si_0, + _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_0, xmm_Di_0), xmm_two), 2)); + xmm_Si_1 = _mm_add_epi32(xmm_Si_1, + _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_1, xmm_Di_1), xmm_two), 2)); + *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0; + *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1; + xmm_Dim1_0 = xmm_Di_0; + xmm_Dim1_1 = xmm_Di_1; + } + } + if (((height) % 2) == 1) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2; + } + } + } else { + OPJ_UINT32 c; + OPJ_UINT32 i; + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(0) -= OPJ_Dc(0); + } + i = 1; + if (i < sn) { + __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 + + (i - 1) * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 + + (i - 1) * 2) * NB_ELTS_V8 + 4 * 1); + for (; i < sn; i++) { + __m128i xmm_Di_0 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Di_1 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 1); + __m128i xmm_Si_0 = *(const __m128i*)(tmp + + (i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Si_1 = *(const __m128i*)(tmp + + (i * 2) * NB_ELTS_V8 + 4 * 1); + xmm_Si_0 = _mm_sub_epi32(xmm_Si_0, + _mm_srai_epi32(_mm_add_epi32(xmm_Di_0, xmm_Dim1_0), 1)); + xmm_Si_1 = _mm_sub_epi32(xmm_Si_1, + _mm_srai_epi32(_mm_add_epi32(xmm_Di_1, xmm_Dim1_1), 1)); + *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0; + 
*(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1; + xmm_Dim1_0 = xmm_Di_0; + xmm_Dim1_1 = xmm_Di_1; + } + } + if (((height) % 2) == 1) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) -= OPJ_Dc(i - 1); + } + } + i = 0; + if (i + 1 < dn) { + __m128i xmm_Si_0 = *((const __m128i*)(tmp + 4 * 0)); + __m128i xmm_Si_1 = *((const __m128i*)(tmp + 4 * 1)); + const __m128i xmm_two = _mm_set1_epi32(2); + for (; i + 1 < dn; i++) { + __m128i xmm_Sip1_0 = *(const __m128i*)(tmp + + (i + 1) * 2 * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Sip1_1 = *(const __m128i*)(tmp + + (i + 1) * 2 * NB_ELTS_V8 + 4 * 1); + __m128i xmm_Di_0 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Di_1 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 1); + xmm_Di_0 = _mm_add_epi32(xmm_Di_0, + _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), xmm_two), 2)); + xmm_Di_1 = _mm_add_epi32(xmm_Di_1, + _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), xmm_two), 2)); + *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0; + *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1; + xmm_Si_0 = xmm_Sip1_0; + xmm_Si_1 = xmm_Sip1_1; + } + } + if (((height) % 2) == 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2; + } + } + } +#else + if (even) { + OPJ_UINT32 c; + if (height > 1) { + OPJ_UINT32 i; + for (i = 0; i + 1 < sn; i++) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) -= (OPJ_Sc(i) + OPJ_Sc(i + 1)) >> 1; + } + } + if (((height) % 2) == 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) -= OPJ_Sc(i); + } + } + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2; + } + for (i = 1; i < dn; i++) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i) + 2) >> 2; + } + } + if (((height) % 2) == 1) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2; + } + } + } + } else { + OPJ_UINT32 c; + if (height == 1) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(0) *= 2; + } + } else { + OPJ_UINT32 i; + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(0) -= OPJ_Dc(0); + } + for (i = 1; i < sn; i++) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) -= (OPJ_Dc(i) + OPJ_Dc(i - 1)) >> 1; + } + } + if (((height) % 2) == 1) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) -= OPJ_Dc(i - 1); + } + } + for (i = 0; i + 1 < dn; i++) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i + 1) + 2) >> 2; + } + } + if (((height) % 2) == 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2; + } + } + } + } +#endif + + if (cols == NB_ELTS_V8) { + opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn, + stride_width, even ? 0 : 1, NB_ELTS_V8); + } else { + opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn, + stride_width, even ? 
0 : 1, cols); + } +} + +static void opj_v8dwt_encode_step1(OPJ_FLOAT32* fw, + OPJ_UINT32 end, + const OPJ_FLOAT32 cst) +{ + OPJ_UINT32 i; +#ifdef __SSE__ + __m128* vw = (__m128*) fw; + const __m128 vcst = _mm_set1_ps(cst); + for (i = 0; i < end; ++i) { + vw[0] = _mm_mul_ps(vw[0], vcst); + vw[1] = _mm_mul_ps(vw[1], vcst); + vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128)); + } +#else + OPJ_UINT32 c; + for (i = 0; i < end; ++i) { + for (c = 0; c < NB_ELTS_V8; c++) { + fw[i * 2 * NB_ELTS_V8 + c] *= cst; + } + } +#endif +} + +static void opj_v8dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw, + OPJ_UINT32 end, + OPJ_UINT32 m, + OPJ_FLOAT32 cst) +{ + OPJ_UINT32 i; + OPJ_UINT32 imax = opj_uint_min(end, m); +#ifdef __SSE__ + __m128* vw = (__m128*) fw; + __m128 vcst = _mm_set1_ps(cst); + if (imax > 0) { + __m128* vl = (__m128*) fl; + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), vcst)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), vcst)); + vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128)); + i = 1; + + for (; i < imax; ++i) { + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), vcst)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), vcst)); + vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128)); + } + } + if (m < end) { + assert(m + 1 == end); + vcst = _mm_add_ps(vcst, vcst); + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(vw[-4], vcst)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(vw[-3], vcst)); + } +#else + OPJ_INT32 c; + if (imax > 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + fw[-1 * NB_ELTS_V8 + c] += (fl[0 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) * + cst; + } + fw += 2 * NB_ELTS_V8; + i = 1; + for (; i < imax; ++i) { + for (c = 0; c < NB_ELTS_V8; c++) { + fw[-1 * NB_ELTS_V8 + c] += (fw[-2 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) * + cst; + } + fw += 2 * NB_ELTS_V8; + } + } + if (m < end) { + assert(m + 1 == end); + for (c = 0; c < NB_ELTS_V8; c++) { + fw[-1 * NB_ELTS_V8 + c] += (2 * fw[-2 * NB_ELTS_V8 + c]) * cst; + } + } +#endif +} + +/* Forward 9-7 transform, for the vertical pass, processing cols columns */ +/* where cols <= NB_ELTS_V8 */ +static void opj_dwt_encode_and_deinterleave_v_real( + void *arrayIn, + void *tmpIn, + OPJ_UINT32 height, + OPJ_BOOL even, + OPJ_UINT32 stride_width, + OPJ_UINT32 cols) +{ + OPJ_FLOAT32* OPJ_RESTRICT array = (OPJ_FLOAT32 * OPJ_RESTRICT)arrayIn; + OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32 * OPJ_RESTRICT)tmpIn; + const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 
1 : 0)) >> 1); + const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn); + OPJ_INT32 a, b; + + if (height == 1) { + return; + } + + opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols); + + if (even) { + a = 0; + b = 1; + } else { + a = 1; + b = 0; + } + opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8, + tmp + (b + 1) * NB_ELTS_V8, + (OPJ_UINT32)dn, + (OPJ_UINT32)opj_int_min(dn, sn - b), + opj_dwt_alpha); + opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8, + tmp + (a + 1) * NB_ELTS_V8, + (OPJ_UINT32)sn, + (OPJ_UINT32)opj_int_min(sn, dn - a), + opj_dwt_beta); + opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8, + tmp + (b + 1) * NB_ELTS_V8, + (OPJ_UINT32)dn, + (OPJ_UINT32)opj_int_min(dn, sn - b), + opj_dwt_gamma); + opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8, + tmp + (a + 1) * NB_ELTS_V8, + (OPJ_UINT32)sn, + (OPJ_UINT32)opj_int_min(sn, dn - a), + opj_dwt_delta); + opj_v8dwt_encode_step1(tmp + b * NB_ELTS_V8, (OPJ_UINT32)dn, + opj_K); + opj_v8dwt_encode_step1(tmp + a * NB_ELTS_V8, (OPJ_UINT32)sn, + opj_invK); + + + if (cols == NB_ELTS_V8) { + opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp, + (OPJ_INT32*)array, + (OPJ_INT32)dn, (OPJ_INT32)sn, + stride_width, even ? 0 : 1, NB_ELTS_V8); + } else { + opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp, + (OPJ_INT32*)array, + (OPJ_INT32)dn, (OPJ_INT32)sn, + stride_width, even ? 0 : 1, cols); + } +} + /* */ /* Forward 5-3 wavelet transform in 2-D. */ /* */ -static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)) +static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, + opj_tcd_tilecomp_t * tilec, + opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v, + opj_encode_and_deinterleave_h_one_row_fnptr_type + p_encode_and_deinterleave_h_one_row) { - OPJ_INT32 i, j, k; - OPJ_INT32 *a = 00; - OPJ_INT32 *aj = 00; + OPJ_INT32 i; OPJ_INT32 *bj = 00; - OPJ_INT32 w, l; + OPJ_UINT32 w; + OPJ_INT32 l; - OPJ_INT32 rw; /* width of the resolution level computed */ - OPJ_INT32 rh; /* height of the resolution level computed */ OPJ_SIZE_T l_data_size; opj_tcd_resolution_t * l_cur_res = 0; opj_tcd_resolution_t * l_last_res = 0; + const int num_threads = opj_thread_pool_get_thread_count(tp); + OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data; - w = tilec->x1 - tilec->x0; + w = (OPJ_UINT32)(tilec->x1 - tilec->x0); l = (OPJ_INT32)tilec->numresolutions - 1; - a = tilec->data; l_cur_res = tilec->resolutions + l; l_last_res = l_cur_res - 1; l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions); /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(OPJ_INT32))) { + if (l_data_size > (SIZE_MAX / (NB_ELTS_V8 * sizeof(OPJ_INT32)))) { /* FIXME event manager error callback */ return OPJ_FALSE; } - l_data_size *= sizeof(OPJ_INT32); - bj = (OPJ_INT32*)opj_malloc(l_data_size); + l_data_size *= NB_ELTS_V8 * sizeof(OPJ_INT32); + bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */ /* in that case, so do not error out */ if (l_data_size != 0 && ! 
bj) { @@ -1145,43 +1754,135 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, i = l; while (i--) { - OPJ_INT32 rw1; /* width of the resolution level once lower than computed one */ - OPJ_INT32 rh1; /* height of the resolution level once lower than computed one */ + OPJ_UINT32 j; + OPJ_UINT32 rw; /* width of the resolution level computed */ + OPJ_UINT32 rh; /* height of the resolution level computed */ + OPJ_UINT32 + rw1; /* width of the resolution level once lower than computed one */ + OPJ_UINT32 + rh1; /* height of the resolution level once lower than computed one */ OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */ OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */ OPJ_INT32 dn, sn; - rw = l_cur_res->x1 - l_cur_res->x0; - rh = l_cur_res->y1 - l_cur_res->y0; - rw1 = l_last_res->x1 - l_last_res->x0; - rh1 = l_last_res->y1 - l_last_res->y0; + rw = (OPJ_UINT32)(l_cur_res->x1 - l_cur_res->x0); + rh = (OPJ_UINT32)(l_cur_res->y1 - l_cur_res->y0); + rw1 = (OPJ_UINT32)(l_last_res->x1 - l_last_res->x0); + rh1 = (OPJ_UINT32)(l_last_res->y1 - l_last_res->y0); cas_row = l_cur_res->x0 & 1; cas_col = l_cur_res->y0 & 1; - sn = rh1; - dn = rh - rh1; - for (j = 0; j < rw; ++j) { - aj = a + j; - for (k = 0; k < rh; ++k) { - bj[k] = aj[k * w]; + sn = (OPJ_INT32)rh1; + dn = (OPJ_INT32)(rh - rh1); + + /* Perform vertical pass */ + if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) { + for (j = 0; j + NB_ELTS_V8 - 1 < rw; j += NB_ELTS_V8) { + p_encode_and_deinterleave_v(tiledp + j, + bj, + rh, + cas_col == 0, + w, + NB_ELTS_V8); } + if (j < rw) { + p_encode_and_deinterleave_v(tiledp + j, + bj, + rh, + cas_col == 0, + w, + rw - j); + } + } else { + OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; + OPJ_UINT32 step_j; - (*p_function)(bj, dn, sn, cas_col); + if (rw < num_jobs) { + num_jobs = rw; + } + step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8; - opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col); + for (j = 0; j < num_jobs; j++) { + opj_dwt_encode_v_job_t* job; + + job = (opj_dwt_encode_v_job_t*) opj_malloc(sizeof(opj_dwt_encode_v_job_t)); + if (!job) { + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(bj); + return OPJ_FALSE; + } + job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); + if (!job->v.mem) { + opj_thread_pool_wait_completion(tp, 0); + opj_free(job); + opj_aligned_free(bj); + return OPJ_FALSE; + } + job->v.dn = dn; + job->v.sn = sn; + job->v.cas = cas_col; + job->rh = rh; + job->w = w; + job->tiledp = tiledp; + job->min_j = j * step_j; + job->max_j = (j + 1 == num_jobs) ? rw : (j + 1) * step_j; + job->p_encode_and_deinterleave_v = p_encode_and_deinterleave_v; + opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job); + } + opj_thread_pool_wait_completion(tp, 0); } - sn = rw1; - dn = rw - rw1; + sn = (OPJ_INT32)rw1; + dn = (OPJ_INT32)(rw - rw1); - for (j = 0; j < rh; j++) { - aj = a + j * w; - for (k = 0; k < rw; k++) { - bj[k] = aj[k]; + /* Perform horizontal pass */ + if (num_threads <= 1 || rh <= 1) { + for (j = 0; j < rh; j++) { + OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w; + (*p_encode_and_deinterleave_h_one_row)(aj, bj, rw, + cas_row == 0 ? 
OPJ_TRUE : OPJ_FALSE); } - (*p_function)(bj, dn, sn, cas_row); - opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row); + } else { + OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; + OPJ_UINT32 step_j; + + if (rh < num_jobs) { + num_jobs = rh; + } + step_j = (rh / num_jobs); + + for (j = 0; j < num_jobs; j++) { + opj_dwt_encode_h_job_t* job; + + job = (opj_dwt_encode_h_job_t*) opj_malloc(sizeof(opj_dwt_encode_h_job_t)); + if (!job) { + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(bj); + return OPJ_FALSE; + } + job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); + if (!job->h.mem) { + opj_thread_pool_wait_completion(tp, 0); + opj_free(job); + opj_aligned_free(bj); + return OPJ_FALSE; + } + job->h.dn = dn; + job->h.sn = sn; + job->h.cas = cas_row; + job->rw = rw; + job->w = w; + job->tiledp = tiledp; + job->min_j = j * step_j; + job->max_j = (j + 1U) * step_j; /* this can overflow */ + if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ + job->max_j = rh; + } + job->p_function = p_encode_and_deinterleave_h_one_row; + opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job); + } + opj_thread_pool_wait_completion(tp, 0); } l_cur_res = l_last_res; @@ -1189,15 +1890,18 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, --l_last_res; } - opj_free(bj); + opj_aligned_free(bj); return OPJ_TRUE; } /* Forward 5-3 wavelet transform in 2-D. */ /* */ -OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec) +OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd, + opj_tcd_tilecomp_t * tilec) { - return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1); + return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, + opj_dwt_encode_and_deinterleave_v, + opj_dwt_encode_and_deinterleave_h_one_row); } /* */ @@ -1213,21 +1917,6 @@ OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec, } } - -/* */ -/* Get gain of 5-3 wavelet transform. */ -/* */ -OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient) -{ - if (orient == 0) { - return 0; - } - if (orient == 1 || orient == 2) { - return 1; - } - return 2; -} - /* */ /* Get norm of 5-3 wavelet. */ /* */ @@ -1247,18 +1936,12 @@ OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient) /* */ /* Forward 9-7 wavelet transform in 2-D. */ /* */ -OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec) +OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd, + opj_tcd_tilecomp_t * tilec) { - return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real); -} - -/* */ -/* Get gain of 9-7 wavelet transform. 
*/ -/* */ -OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient) -{ - (void)orient; - return 0; + return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, + opj_dwt_encode_and_deinterleave_v_real, + opj_dwt_encode_and_deinterleave_h_one_row_real); } /* */ @@ -1293,7 +1976,7 @@ void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec) if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) { stepsize = 1.0; } else { - OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level]; + OPJ_FLOAT64 norm = opj_dwt_getnorm_real(level, orient); stepsize = (1 << (gain)) / norm; } opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0), @@ -1328,15 +2011,15 @@ typedef struct { OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; -} opj_dwd_decode_h_job_t; +} opj_dwt_decode_h_job_t; static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls) { OPJ_UINT32 j; - opj_dwd_decode_h_job_t* job; + opj_dwt_decode_h_job_t* job; (void)tls; - job = (opj_dwd_decode_h_job_t*)user_data; + job = (opj_dwt_decode_h_job_t*)user_data; for (j = job->min_j; j < job->max_j; j++) { opj_idwt53_h(&job->h, &job->tiledp[j * job->w]); } @@ -1352,15 +2035,15 @@ typedef struct { OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; -} opj_dwd_decode_v_job_t; +} opj_dwt_decode_v_job_t; static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls) { OPJ_UINT32 j; - opj_dwd_decode_v_job_t* job; + opj_dwt_decode_v_job_t* job; (void)tls; - job = (opj_dwd_decode_v_job_t*)user_data; + job = (opj_dwt_decode_v_job_t*)user_data; for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j; j += PARALLEL_COLS_53) { opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, @@ -1447,9 +2130,9 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, step_j = (rh / num_jobs); for (j = 0; j < num_jobs; j++) { - opj_dwd_decode_h_job_t* job; + opj_dwt_decode_h_job_t* job; - job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t)); + job = (opj_dwt_decode_h_job_t*) opj_malloc(sizeof(opj_dwt_decode_h_job_t)); if (!job) { /* It would be nice to fallback to single thread case, but */ /* unfortunately some jobs may be launched and have modified */ @@ -1502,9 +2185,9 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, step_j = (rw / num_jobs); for (j = 0; j < num_jobs; j++) { - opj_dwd_decode_v_job_t* job; + opj_dwt_decode_v_job_t* job; - job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t)); + job = (opj_dwt_decode_v_job_t*) opj_malloc(sizeof(opj_dwt_decode_v_job_t)); if (!job) { /* It would be nice to fallback to single thread case, but */ /* unfortunately some jobs may be launched and have modified */ @@ -2168,7 +2851,7 @@ static OPJ_BOOL opj_dwt_decode_partial_tile( return OPJ_TRUE; } -static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, +static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt, OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_UINT32 width, OPJ_UINT32 remaining_height) @@ -2179,39 +2862,69 @@ static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, OPJ_UINT32 x1 = dwt->win_l_x1; for (k = 0; k < 2; ++k) { - if (remaining_height >= 4 && ((OPJ_SIZE_T) a & 0x0f) == 0 && - ((OPJ_SIZE_T) bi & 0x0f) == 0 && (width & 0x0f) == 0) { + if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 && + ((OPJ_SIZE_T) bi & 0x0f) == 0) { /* Fast code path */ for (i = x0; i < x1; ++i) { OPJ_UINT32 j = i; - bi[i * 8 ] = a[j]; + OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8; + dst[0] = a[j]; j += width; - bi[i * 8 + 1] = a[j]; + dst[1] 
= a[j]; j += width; - bi[i * 8 + 2] = a[j]; + dst[2] = a[j]; j += width; - bi[i * 8 + 3] = a[j]; + dst[3] = a[j]; + j += width; + dst[4] = a[j]; + j += width; + dst[5] = a[j]; + j += width; + dst[6] = a[j]; + j += width; + dst[7] = a[j]; } } else { /* Slow code path */ for (i = x0; i < x1; ++i) { OPJ_UINT32 j = i; - bi[i * 8 ] = a[j]; + OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8; + dst[0] = a[j]; j += width; if (remaining_height == 1) { continue; } - bi[i * 8 + 1] = a[j]; + dst[1] = a[j]; j += width; if (remaining_height == 2) { continue; } - bi[i * 8 + 2] = a[j]; + dst[2] = a[j]; j += width; if (remaining_height == 3) { continue; } - bi[i * 8 + 3] = a[j]; /* This one*/ + dst[3] = a[j]; + j += width; + if (remaining_height == 4) { + continue; + } + dst[4] = a[j]; + j += width; + if (remaining_height == 5) { + continue; + } + dst[5] = a[j]; + j += width; + if (remaining_height == 6) { + continue; + } + dst[6] = a[j]; + j += width; + if (remaining_height == 7) { + continue; + } + dst[7] = a[j]; } } @@ -2222,7 +2935,7 @@ static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, } } -static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, +static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt, opj_sparse_array_int32_t* sa, OPJ_UINT32 sa_line, OPJ_UINT32 remaining_height) @@ -2235,25 +2948,25 @@ static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, dwt->win_l_x1, sa_line + i + 1, /* Nasty cast from float* to int32* */ (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i, - 8, 0, OPJ_TRUE); + 2 * NB_ELTS_V8, 0, OPJ_TRUE); assert(ret); ret = opj_sparse_array_int32_read(sa, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1, /* Nasty cast from float* to int32* */ (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i, - 8, 0, OPJ_TRUE); + 2 * NB_ELTS_V8, 0, OPJ_TRUE); assert(ret); OPJ_UNUSED(ret); } } -static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 nb_elts_read) +static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt, + OPJ_FLOAT32* OPJ_RESTRICT a, + OPJ_UINT32 width, + OPJ_UINT32 nb_elts_read) { - opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; + opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; OPJ_UINT32 i; for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) { @@ -2270,7 +2983,7 @@ static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, } } -static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, +static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt, opj_sparse_array_int32_t* sa, OPJ_UINT32 sa_col, OPJ_UINT32 nb_elts_read) @@ -2280,44 +2993,36 @@ static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, sa_col, dwt->win_l_x0, sa_col + nb_elts_read, dwt->win_l_x1, (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0), - 1, 8, OPJ_TRUE); + 1, 2 * NB_ELTS_V8, OPJ_TRUE); assert(ret); ret = opj_sparse_array_int32_read(sa, sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0), - 1, 8, OPJ_TRUE); + 1, 2 * NB_ELTS_V8, OPJ_TRUE); assert(ret); OPJ_UNUSED(ret); } #ifdef __SSE__ -static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, +static void opj_v8dwt_decode_step1_sse(opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, const __m128 c) { __m128* OPJ_RESTRICT vw = (__m128*) w; - OPJ_UINT32 i; - /* 4x unrolled loop */ - vw 
+= 2 * start; - for (i = start; i + 3 < end; i += 4, vw += 8) { - __m128 xmm0 = _mm_mul_ps(vw[0], c); - __m128 xmm2 = _mm_mul_ps(vw[2], c); - __m128 xmm4 = _mm_mul_ps(vw[4], c); - __m128 xmm6 = _mm_mul_ps(vw[6], c); - vw[0] = xmm0; - vw[2] = xmm2; - vw[4] = xmm4; - vw[6] = xmm6; - } - for (; i < end; ++i, vw += 2) { + OPJ_UINT32 i = start; + /* To be adapted if NB_ELTS_V8 changes */ + vw += 4 * start; + /* Note: attempt at loop unrolling x2 doesn't help */ + for (; i < end; ++i, vw += 4) { vw[0] = _mm_mul_ps(vw[0], c); + vw[1] = _mm_mul_ps(vw[1], c); } } -static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, +static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, OPJ_UINT32 m, @@ -2325,74 +3030,58 @@ static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, { __m128* OPJ_RESTRICT vl = (__m128*) l; __m128* OPJ_RESTRICT vw = (__m128*) w; + /* To be adapted if NB_ELTS_V8 changes */ OPJ_UINT32 i; OPJ_UINT32 imax = opj_uint_min(end, m); - __m128 tmp1, tmp2, tmp3; if (start == 0) { - tmp1 = vl[0]; + if (imax >= 1) { + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c)); + vw += 4; + start = 1; + } } else { - vw += start * 2; - tmp1 = vw[-3]; + vw += start * 4; } i = start; - - /* 4x loop unrolling */ - for (; i + 3 < imax; i += 4) { - __m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; - tmp2 = vw[-1]; - tmp3 = vw[ 0]; - tmp4 = vw[ 1]; - tmp5 = vw[ 2]; - tmp6 = vw[ 3]; - tmp7 = vw[ 4]; - tmp8 = vw[ 5]; - tmp9 = vw[ 6]; - vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); - vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c)); - vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c)); - vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c)); - tmp1 = tmp9; - vw += 8; - } - + /* Note: attempt at loop unrolling x2 doesn't help */ for (; i < imax; ++i) { - tmp2 = vw[-1]; - tmp3 = vw[ 0]; - vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); - tmp1 = tmp3; - vw += 2; + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c)); + vw += 4; } if (m < end) { assert(m + 1 == end); c = _mm_add_ps(c, c); - c = _mm_mul_ps(c, vw[-2]); - vw[-1] = _mm_add_ps(vw[-1], c); + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4])); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3])); } } #else -static void opj_v4dwt_decode_step1(opj_v4_t* w, +static void opj_v8dwt_decode_step1(opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, const OPJ_FLOAT32 c) { OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; OPJ_UINT32 i; + /* To be adapted if NB_ELTS_V8 changes */ for (i = start; i < end; ++i) { - OPJ_FLOAT32 tmp1 = fw[i * 8 ]; - OPJ_FLOAT32 tmp2 = fw[i * 8 + 1]; - OPJ_FLOAT32 tmp3 = fw[i * 8 + 2]; - OPJ_FLOAT32 tmp4 = fw[i * 8 + 3]; - fw[i * 8 ] = tmp1 * c; - fw[i * 8 + 1] = tmp2 * c; - fw[i * 8 + 2] = tmp3 * c; - fw[i * 8 + 3] = tmp4 * c; + fw[i * 2 * 8 ] = fw[i * 2 * 8 ] * c; + fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c; + fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c; + fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c; + fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c; + fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c; + fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c; + fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c; } } -static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, +static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, 
OPJ_UINT32 m, @@ -2403,36 +3092,33 @@ static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_UINT32 i; OPJ_UINT32 imax = opj_uint_min(end, m); if (start > 0) { - fw += 8 * start; - fl = fw - 8; + fw += 2 * NB_ELTS_V8 * start; + fl = fw - 2 * NB_ELTS_V8; } + /* To be adapted if NB_ELTS_V8 changes */ for (i = start; i < imax; ++i) { - OPJ_FLOAT32 tmp1_1 = fl[0]; - OPJ_FLOAT32 tmp1_2 = fl[1]; - OPJ_FLOAT32 tmp1_3 = fl[2]; - OPJ_FLOAT32 tmp1_4 = fl[3]; - OPJ_FLOAT32 tmp2_1 = fw[-4]; - OPJ_FLOAT32 tmp2_2 = fw[-3]; - OPJ_FLOAT32 tmp2_3 = fw[-2]; - OPJ_FLOAT32 tmp2_4 = fw[-1]; - OPJ_FLOAT32 tmp3_1 = fw[0]; - OPJ_FLOAT32 tmp3_2 = fw[1]; - OPJ_FLOAT32 tmp3_3 = fw[2]; - OPJ_FLOAT32 tmp3_4 = fw[3]; - fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c); - fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c); - fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c); - fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c); + fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c); + fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c); + fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c); + fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c); + fw[-4] = fw[-4] + ((fl[4] + fw[4]) * c); + fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c); + fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c); + fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c); fl = fw; - fw += 8; + fw += 2 * NB_ELTS_V8; } if (m < end) { assert(m + 1 == end); c += c; - fw[-4] = fw[-4] + fl[0] * c; - fw[-3] = fw[-3] + fl[1] * c; - fw[-2] = fw[-2] + fl[2] * c; - fw[-1] = fw[-1] + fl[3] * c; + fw[-8] = fw[-8] + fl[0] * c; + fw[-7] = fw[-7] + fl[1] * c; + fw[-6] = fw[-6] + fl[2] * c; + fw[-5] = fw[-5] + fl[3] * c; + fw[-4] = fw[-4] + fl[4] * c; + fw[-3] = fw[-3] + fl[5] * c; + fw[-2] = fw[-2] + fl[6] * c; + fw[-1] = fw[-1] + fl[7] * c; } } @@ -2441,9 +3127,17 @@ static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, /* */ /* Inverse 9-7 wavelet transform in 1-D. 
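
The eight explicit update lines per iteration above are one lifting step applied to all eight lanes at once. On a single lane the same update takes the textbook form (illustrative helper, assuming `other` exposes n + 1 readable values):

    /* Illustration only: one 9/7 lifting update. Each coefficient of one
       parity band is incremented by c times the sum of its two
       neighbours in the other band; dwt.c runs this on 8 lanes at once. */
    static void lift_update(float* band, const float* other,
                            unsigned n, float c)
    {
        unsigned i;
        for (i = 0; i < n; ++i) {
            band[i] += c * (other[i] + other[i + 1]);
        }
    }
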
*/ /* */ -static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) +static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt) { OPJ_INT32 a, b; + /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */ + /* Historic value for 2 / opj_invK */ + /* Normally, we should use invK, but if we do so, we have failures in the */ + /* conformance test, due to MSE and peak errors significantly higher than */ + /* accepted value */ + /* Due to using two_invK instead of invK, we have to compensate in tcd.c */ + /* the computation of the stepsize for the non LL subbands */ + const float two_invK = 1.625732422f; if (dwt->cas == 0) { if (!((dwt->dn > 0) || (dwt->sn > 1))) { return; @@ -2458,60 +3152,147 @@ static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) b = 0; } #ifdef __SSE__ - opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, + opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, _mm_set1_ps(opj_K)); - opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, - _mm_set1_ps(opj_c13318)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, + opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, + _mm_set1_ps(two_invK)); + opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - _mm_set1_ps(opj_dwt_delta)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, + _mm_set1_ps(-opj_dwt_delta)); + opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - _mm_set1_ps(opj_dwt_gamma)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, + _mm_set1_ps(-opj_dwt_gamma)); + opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - _mm_set1_ps(opj_dwt_beta)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, + _mm_set1_ps(-opj_dwt_beta)); + opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - _mm_set1_ps(opj_dwt_alpha)); + _mm_set1_ps(-opj_dwt_alpha)); #else - opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, + opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, opj_K); - opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, - opj_c13318); - opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, + opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, + two_invK); + opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - opj_dwt_delta); - opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, + -opj_dwt_delta); + opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - opj_dwt_gamma); - opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, + -opj_dwt_gamma); + opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - opj_dwt_beta); - opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, + -opj_dwt_beta); + opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - 
b), - opj_dwt_alpha); + -opj_dwt_alpha); #endif } +typedef struct { + opj_v8dwt_t h; + OPJ_UINT32 rw; + OPJ_UINT32 w; + OPJ_FLOAT32 * OPJ_RESTRICT aj; + OPJ_UINT32 nb_rows; +} opj_dwt97_decode_h_job_t; + +static void opj_dwt97_decode_h_func(void* user_data, opj_tls_t* tls) +{ + OPJ_UINT32 j; + opj_dwt97_decode_h_job_t* job; + OPJ_FLOAT32 * OPJ_RESTRICT aj; + OPJ_UINT32 w; + (void)tls; + + job = (opj_dwt97_decode_h_job_t*)user_data; + w = job->w; + + assert((job->nb_rows % NB_ELTS_V8) == 0); + + aj = job->aj; + for (j = 0; j + NB_ELTS_V8 <= job->nb_rows; j += NB_ELTS_V8) { + OPJ_UINT32 k; + opj_v8dwt_interleave_h(&job->h, aj, job->w, NB_ELTS_V8); + opj_v8dwt_decode(&job->h); + + /* To be adapted if NB_ELTS_V8 changes */ + for (k = 0; k < job->rw; k++) { + aj[k ] = job->h.wavelet[k].f[0]; + aj[k + (OPJ_SIZE_T)w ] = job->h.wavelet[k].f[1]; + aj[k + (OPJ_SIZE_T)w * 2] = job->h.wavelet[k].f[2]; + aj[k + (OPJ_SIZE_T)w * 3] = job->h.wavelet[k].f[3]; + } + for (k = 0; k < job->rw; k++) { + aj[k + (OPJ_SIZE_T)w * 4] = job->h.wavelet[k].f[4]; + aj[k + (OPJ_SIZE_T)w * 5] = job->h.wavelet[k].f[5]; + aj[k + (OPJ_SIZE_T)w * 6] = job->h.wavelet[k].f[6]; + aj[k + (OPJ_SIZE_T)w * 7] = job->h.wavelet[k].f[7]; + } + + aj += w * NB_ELTS_V8; + } + + opj_aligned_free(job->h.wavelet); + opj_free(job); +} + + +typedef struct { + opj_v8dwt_t v; + OPJ_UINT32 rh; + OPJ_UINT32 w; + OPJ_FLOAT32 * OPJ_RESTRICT aj; + OPJ_UINT32 nb_columns; +} opj_dwt97_decode_v_job_t; + +static void opj_dwt97_decode_v_func(void* user_data, opj_tls_t* tls) +{ + OPJ_UINT32 j; + opj_dwt97_decode_v_job_t* job; + OPJ_FLOAT32 * OPJ_RESTRICT aj; + (void)tls; + + job = (opj_dwt97_decode_v_job_t*)user_data; + + assert((job->nb_columns % NB_ELTS_V8) == 0); + + aj = job->aj; + for (j = 0; j + NB_ELTS_V8 <= job->nb_columns; j += NB_ELTS_V8) { + OPJ_UINT32 k; + + opj_v8dwt_interleave_v(&job->v, aj, job->w, NB_ELTS_V8); + opj_v8dwt_decode(&job->v); + + for (k = 0; k < job->rh; ++k) { + memcpy(&aj[k * (OPJ_SIZE_T)job->w], &job->v.wavelet[k], + NB_ELTS_V8 * sizeof(OPJ_FLOAT32)); + } + aj += NB_ELTS_V8; + } + + opj_aligned_free(job->v.wavelet); + opj_free(job); +} + /* */ /* Inverse 9-7 wavelet transform in 2-D. 
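
The two worker functions above each consume a pre-sliced strip of NB_ELTS_V8 rows or columns. The slicing arithmetic used by the submission loops further down is easiest to verify numerically (standalone example; the input values are made up for illustration):

    #include <stdio.h>

    #define NB_ELTS_V8 8

    int main(void)
    {
        unsigned rh = 103;          /* rows at this resolution (example) */
        unsigned num_threads = 4;
        unsigned num_jobs = num_threads;
        unsigned step_j, last, tail;

        if (rh / NB_ELTS_V8 < num_jobs) {
            num_jobs = rh / NB_ELTS_V8;
        }
        step_j = ((rh / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
        /* the last job takes whatever full strips remain */
        last = (rh & ~(unsigned)(NB_ELTS_V8 - 1)) - (num_jobs - 1) * step_j;
        tail = rh & (NB_ELTS_V8 - 1);   /* rows left to the caller */

        printf("jobs=%u step=%u last=%u tail=%u\n",
               num_jobs, step_j, last, tail);
        /* prints: jobs=4 step=24 last=24 tail=7 (3*24 + 24 + 7 = 103) */
        return 0;
    }
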
*/ /* */ static -OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, +OPJ_BOOL opj_dwt_decode_tile_97(opj_thread_pool_t* tp, + opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_UINT32 numres) { - opj_v4dwt_t h; - opj_v4dwt_t v; + opj_v8dwt_t h; + opj_v8dwt_t v; opj_tcd_resolution_t* res = tilec->resolutions; @@ -2525,20 +3306,19 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); OPJ_SIZE_T l_data_size; + const int num_threads = opj_thread_pool_get_thread_count(tp); + + if (numres == 1) { + return OPJ_TRUE; + } l_data_size = opj_dwt_max_resolution(res, numres); /* overflow check */ - if (l_data_size > (SIZE_MAX - 5U)) { + if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { /* FIXME event manager error callback */ return OPJ_FALSE; } - l_data_size += 5U; - /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); + h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); if (!h.wavelet) { /* FIXME event manager error callback */ return OPJ_FALSE; @@ -2566,35 +3346,80 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, h.win_l_x1 = (OPJ_UINT32)h.sn; h.win_h_x0 = 0; h.win_h_x1 = (OPJ_UINT32)h.dn; - for (j = 0; j + 3 < rh; j += 4) { - OPJ_UINT32 k; - opj_v4dwt_interleave_h(&h, aj, w, rh - j); - opj_v4dwt_decode(&h); - for (k = 0; k < rw; k++) { - aj[k ] = h.wavelet[k].f[0]; - aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; - aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; - aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; + if (num_threads <= 1 || rh < 2 * NB_ELTS_V8) { + for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { + OPJ_UINT32 k; + opj_v8dwt_interleave_h(&h, aj, w, NB_ELTS_V8); + opj_v8dwt_decode(&h); + + /* To be adapted if NB_ELTS_V8 changes */ + for (k = 0; k < rw; k++) { + aj[k ] = h.wavelet[k].f[0]; + aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; + aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; + aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; + } + for (k = 0; k < rw; k++) { + aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4]; + aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5]; + aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6]; + aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7]; + } + + aj += w * NB_ELTS_V8; } + } else { + OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; + OPJ_UINT32 step_j; - aj += w * 4; + if ((rh / NB_ELTS_V8) < num_jobs) { + num_jobs = rh / NB_ELTS_V8; + } + step_j = ((rh / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8; + for (j = 0; j < num_jobs; j++) { + opj_dwt97_decode_h_job_t* job; + + job = (opj_dwt97_decode_h_job_t*) opj_malloc(sizeof(opj_dwt97_decode_h_job_t)); + if (!job) { + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(h.wavelet); + return OPJ_FALSE; + } + job->h.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); + if (!job->h.wavelet) { + opj_thread_pool_wait_completion(tp, 0); + opj_free(job); + opj_aligned_free(h.wavelet); + return OPJ_FALSE; + } + job->h.dn = h.dn; + job->h.sn = h.sn; + job->h.cas = h.cas; + job->h.win_l_x0 = h.win_l_x0; + job->h.win_l_x1 = h.win_l_x1; + job->h.win_h_x0 = h.win_h_x0; + job->h.win_h_x1 = h.win_h_x1; + job->rw = rw; + job->w = w; + job->aj = aj; + job->nb_rows = (j + 1 == num_jobs) ? 
(rh & (OPJ_UINT32)~ + (NB_ELTS_V8 - 1)) - j * step_j : step_j; + aj += w * job->nb_rows; + opj_thread_pool_submit_job(tp, opj_dwt97_decode_h_func, job); + } + opj_thread_pool_wait_completion(tp, 0); + j = rh & (OPJ_UINT32)~(NB_ELTS_V8 - 1); } if (j < rh) { OPJ_UINT32 k; - opj_v4dwt_interleave_h(&h, aj, w, rh - j); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_h(&h, aj, w, rh - j); + opj_v8dwt_decode(&h); for (k = 0; k < rw; k++) { - switch (rh - j) { - case 3: - aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; - /* FALLTHRU */ - case 2: - aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; - /* FALLTHRU */ - case 1: - aj[k] = h.wavelet[k].f[0]; + OPJ_UINT32 l; + for (l = 0; l < rh - j; l++) { + aj[k + (OPJ_SIZE_T)w * l ] = h.wavelet[k].f[l]; } } } @@ -2607,25 +3432,71 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, v.win_h_x1 = (OPJ_UINT32)v.dn; aj = (OPJ_FLOAT32*) tilec->data; - for (j = rw; j > 3; j -= 4) { - OPJ_UINT32 k; + if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) { + for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) { + OPJ_UINT32 k; - opj_v4dwt_interleave_v(&v, aj, w, 4); - opj_v4dwt_decode(&v); + opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8); + opj_v8dwt_decode(&v); - for (k = 0; k < rh; ++k) { - memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32)); + for (k = 0; k < rh; ++k) { + memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32)); + } + aj += NB_ELTS_V8; } - aj += 4; + } else { + /* "bench_dwt -I" shows that scaling is poor, likely due to RAM + transfer being the limiting factor. So limit the number of + threads. + */ + OPJ_UINT32 num_jobs = opj_uint_max((OPJ_UINT32)num_threads / 2, 2U); + OPJ_UINT32 step_j; + + if ((rw / NB_ELTS_V8) < num_jobs) { + num_jobs = rw / NB_ELTS_V8; + } + step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8; + for (j = 0; j < num_jobs; j++) { + opj_dwt97_decode_v_job_t* job; + + job = (opj_dwt97_decode_v_job_t*) opj_malloc(sizeof(opj_dwt97_decode_v_job_t)); + if (!job) { + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(h.wavelet); + return OPJ_FALSE; + } + job->v.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); + if (!job->v.wavelet) { + opj_thread_pool_wait_completion(tp, 0); + opj_free(job); + opj_aligned_free(h.wavelet); + return OPJ_FALSE; + } + job->v.dn = v.dn; + job->v.sn = v.sn; + job->v.cas = v.cas; + job->v.win_l_x0 = v.win_l_x0; + job->v.win_l_x1 = v.win_l_x1; + job->v.win_h_x0 = v.win_h_x0; + job->v.win_h_x1 = v.win_h_x1; + job->rh = rh; + job->w = w; + job->aj = aj; + job->nb_columns = (j + 1 == num_jobs) ? (rw & (OPJ_UINT32)~ + (NB_ELTS_V8 - 1)) - j * step_j : step_j; + aj += job->nb_columns; + opj_thread_pool_submit_job(tp, opj_dwt97_decode_v_func, job); + } + opj_thread_pool_wait_completion(tp, 0); } - if (rw & 0x03) { + if (rw & (NB_ELTS_V8 - 1)) { OPJ_UINT32 k; - j = rw & 0x03; + j = rw & (NB_ELTS_V8 - 1); - opj_v4dwt_interleave_v(&v, aj, w, j); - opj_v4dwt_decode(&v); + opj_v8dwt_interleave_v(&v, aj, w, j); + opj_v8dwt_decode(&v); for (k = 0; k < rh; ++k) { memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], @@ -2643,8 +3514,8 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_UINT32 numres) { opj_sparse_array_int32_t* sa; - opj_v4dwt_t h; - opj_v4dwt_t v; + opj_v8dwt_t h; + opj_v8dwt_t v; OPJ_UINT32 resno; /* This value matches the maximum left/right extension given in tables */ /* F.2 and F.3 of the standard. 
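
Both the whole-tile path above and the windowed path below now use a single division-based guard in place of the old `+ 5U` two-step check. The pattern in isolation (sketch; checked_array_alloc is a hypothetical wrapper, not an OpenJPEG function):

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch of the same guard: refuse any element count whose byte
       size would wrap around size_t, then allocate. */
    static void* checked_array_alloc(size_t n, size_t elem_size)
    {
        if (elem_size != 0 && n > SIZE_MAX / elem_size) {
            return NULL;            /* n * elem_size would overflow */
        }
        return malloc(n * elem_size);
    }
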
Note: in opj_tcd_is_subband_area_of_interest() */ @@ -2694,19 +3565,12 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, l_data_size = opj_dwt_max_resolution(tr, numres); /* overflow check */ - if (l_data_size > (SIZE_MAX - 5U)) { + if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); return OPJ_FALSE; } - l_data_size += 5U; - /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { - /* FIXME event manager error callback */ - opj_sparse_array_int32_free(sa); - return OPJ_FALSE; - } - h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); + h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); if (!h.wavelet) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); @@ -2801,17 +3665,17 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, h.win_l_x1 = win_ll_x1; h.win_h_x0 = win_hl_x0; h.win_h_x1 = win_hl_x1; - for (j = 0; j + 3 < rh; j += 4) { - if ((j + 3 >= win_ll_y0 && j < win_ll_y1) || - (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && + for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { + if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) || + (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) { - opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(4U, rh - j)); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j)); + opj_v8dwt_decode(&h); if (!opj_sparse_array_int32_write(sa, win_tr_x0, j, - win_tr_x1, j + 4, + win_tr_x1, j + NB_ELTS_V8, (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], - 4, 1, OPJ_TRUE)) { + NB_ELTS_V8, 1, OPJ_TRUE)) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); opj_aligned_free(h.wavelet); @@ -2821,16 +3685,16 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, } if (j < rh && - ((j + 3 >= win_ll_y0 && j < win_ll_y1) || - (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && + ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) || + (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn))) { - opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j); + opj_v8dwt_decode(&h); if (!opj_sparse_array_int32_write(sa, win_tr_x0, j, win_tr_x1, rh, (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], - 4, 1, OPJ_TRUE)) { + NB_ELTS_V8, 1, OPJ_TRUE)) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); opj_aligned_free(h.wavelet); @@ -2842,17 +3706,17 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, v.win_l_x1 = win_ll_y1; v.win_h_x0 = win_lh_y0; v.win_h_x1 = win_lh_y1; - for (j = win_tr_x0; j < win_tr_x1; j += 4) { - OPJ_UINT32 nb_elts = opj_uint_min(4U, win_tr_x1 - j); + for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) { + OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j); - opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts); - opj_v4dwt_decode(&v); + opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts); + opj_v8dwt_decode(&v); if (!opj_sparse_array_int32_write(sa, j, win_tr_y0, j + nb_elts, win_tr_y1, (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0], - 1, 4, OPJ_TRUE)) { + 1, NB_ELTS_V8, OPJ_TRUE)) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); opj_aligned_free(h.wavelet); @@ -2885,7 +3749,7 @@ OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd, OPJ_UINT32 numres) { if 
(p_tcd->whole_tile_decoding) { - return opj_dwt_decode_tile_97(tilec, numres); + return opj_dwt_decode_tile_97(p_tcd->thread_pool, tilec, numres); } else { return opj_dwt_decode_partial_97(tilec, numres); } diff --git a/3rdparty/openjpeg/openjp2/dwt.h b/3rdparty/openjpeg/openjp2/dwt.h index 4f63e524a6..215061e6b9 100644 --- a/3rdparty/openjpeg/openjp2/dwt.h +++ b/3rdparty/openjpeg/openjp2/dwt.h @@ -56,9 +56,11 @@ DWT.C are used by some function in TCD.C. /** Forward 5-3 wavelet transform in 2-D. Apply a reversible DWT transform to a component of an image. +@param p_tcd TCD handle @param tilec Tile component information (current tile) */ -OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec); +OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd, + opj_tcd_tilecomp_t * tilec); /** Inverse 5-3 wavelet transform in 2-D. @@ -71,12 +73,6 @@ OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres); -/** -Get the gain of a subband for the reversible 5-3 DWT. -@param orient Number that identifies the subband (0->LL, 1->HL, 2->LH, 3->HH) -@return Returns 0 if orient = 0, returns 1 if orient = 1 or 2, returns 2 otherwise -*/ -OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient) ; /** Get the norm of a wavelet function of a subband at a specified level for the reversible 5-3 DWT. @param level Level of the wavelet function @@ -87,9 +83,11 @@ OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient); /** Forward 9-7 wavelet transform in 2-D. Apply an irreversible DWT transform to a component of an image. +@param p_tcd TCD handle @param tilec Tile component information (current tile) */ -OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec); +OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd, + opj_tcd_tilecomp_t * tilec); /** Inverse 9-7 wavelet transform in 2-D. Apply an irreversible inverse DWT transform to a component of an image. @@ -101,12 +99,6 @@ OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_UINT32 numres); -/** -Get the gain of a subband for the irreversible 9-7 DWT. -@param orient Number that identifies the subband (0->LL, 1->HL, 2->LH, 3->HH) -@return Returns the gain of the 9-7 wavelet transform -*/ -OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient); /** Get the norm of a wavelet function of a subband at a specified level for the irreversible 9-7 DWT @param level Level of the wavelet function diff --git a/3rdparty/openjpeg/openjp2/j2k.c b/3rdparty/openjpeg/openjp2/j2k.c index 4169cd672b..8e343ab2e3 100644 --- a/3rdparty/openjpeg/openjp2/j2k.c +++ b/3rdparty/openjpeg/openjp2/j2k.c @@ -400,14 +400,14 @@ static OPJ_BOOL opj_j2k_setup_header_writing(opj_j2k_t *p_j2k, static OPJ_BOOL opj_j2k_write_first_tile_part(opj_j2k_t *p_j2k, OPJ_BYTE * p_data, OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, + OPJ_UINT32 total_data_size, opj_stream_private_t *p_stream, struct opj_event_mgr * p_manager); static OPJ_BOOL opj_j2k_write_all_tile_parts(opj_j2k_t *p_j2k, OPJ_BYTE * p_data, OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, + OPJ_UINT32 total_data_size, opj_stream_private_t *p_stream, struct opj_event_mgr * p_manager); @@ -832,14 +832,14 @@ static OPJ_BOOL opj_j2k_write_tlm(opj_j2k_t *p_j2k, * * @param p_j2k J2K codec. * @param p_data Output buffer - * @param p_total_data_size Output buffer size + * @param total_data_size Output buffer size * @param p_data_written Number of bytes written into stream * @param p_stream the stream to write data to. * @param p_manager the user event manager. 
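
For readers tracking the API removals in dwt.h above: the deleted opj_dwt_getgain implemented exactly the mapping its doc comment stated. Reconstructed from that comment as an illustration (5-3 reversible case):

    /* 0 -> LL, 1 -> HL, 2 -> LH, 3 -> HH; reconstructed illustration */
    static unsigned dwt_getgain_53(unsigned orient)
    {
        if (orient == 0) {
            return 0;               /* LL */
        }
        if (orient == 1 || orient == 2) {
            return 1;               /* HL / LH */
        }
        return 2;                   /* HH */
    }
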
*/ static OPJ_BOOL opj_j2k_write_sot(opj_j2k_t *p_j2k, OPJ_BYTE * p_data, - OPJ_UINT32 p_total_data_size, + OPJ_UINT32 total_data_size, OPJ_UINT32 * p_data_written, const opj_stream_private_t *p_stream, opj_event_mgr_t * p_manager); @@ -879,11 +879,13 @@ static OPJ_BOOL opj_j2k_read_sot(opj_j2k_t *p_j2k, /** * Writes the SOD marker (Start of data) * + * This also writes optional PLT markers (before SOD) + * * @param p_j2k J2K codec. * @param p_tile_coder FIXME DOC * @param p_data FIXME DOC * @param p_data_written FIXME DOC - * @param p_total_data_size FIXME DOC + * @param total_data_size FIXME DOC * @param p_stream the stream to write data to. * @param p_manager the user event manager. */ @@ -891,7 +893,7 @@ static OPJ_BOOL opj_j2k_write_sod(opj_j2k_t *p_j2k, opj_tcd_t * p_tile_coder, OPJ_BYTE * p_data, OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, + OPJ_UINT32 total_data_size, const opj_stream_private_t *p_stream, opj_event_mgr_t * p_manager); @@ -1219,6 +1221,7 @@ static OPJ_BOOL opj_j2k_write_epc(opj_j2k_t *p_j2k, * A nice message is outputted at errors. * * @param p_pocs the progression order changes. + * @param tileno the tile number of interest * @param p_nb_pocs the number of progression order changes. * @param p_nb_resolutions the number of resolutions. * @param numcomps the number of components @@ -1228,6 +1231,7 @@ static OPJ_BOOL opj_j2k_write_epc(opj_j2k_t *p_j2k, * @return true if the pocs are valid. */ static OPJ_BOOL opj_j2k_check_poc_val(const opj_poc_t *p_pocs, + OPJ_UINT32 tileno, OPJ_UINT32 p_nb_pocs, OPJ_UINT32 p_nb_resolutions, OPJ_UINT32 numcomps, @@ -1282,6 +1286,13 @@ static void opj_j2k_set_cinema_parameters(opj_cparameters_t *parameters, static OPJ_BOOL opj_j2k_is_cinema_compliant(opj_image_t *image, OPJ_UINT16 rsiz, opj_event_mgr_t *p_manager); +static void opj_j2k_set_imf_parameters(opj_cparameters_t *parameters, + opj_image_t *image, opj_event_mgr_t *p_manager); + +static OPJ_BOOL opj_j2k_is_imf_compliant(opj_cparameters_t *parameters, + opj_image_t *image, + opj_event_mgr_t *p_manager); + /** * Checks for invalid number of tile-parts in SOT marker (TPsot==TNsot). See issue 254. * @@ -1615,6 +1626,7 @@ const char *opj_j2k_convert_progression_order(OPJ_PROG_ORDER prg_order) } static OPJ_BOOL opj_j2k_check_poc_val(const opj_poc_t *p_pocs, + OPJ_UINT32 tileno, OPJ_UINT32 p_nb_pocs, OPJ_UINT32 p_nb_resolutions, OPJ_UINT32 p_num_comps, @@ -1628,7 +1640,8 @@ static OPJ_BOOL opj_j2k_check_poc_val(const opj_poc_t *p_pocs, OPJ_UINT32 step_r = p_num_comps * step_c; OPJ_UINT32 step_l = p_nb_resolutions * step_r; OPJ_BOOL loss = OPJ_FALSE; - OPJ_UINT32 layno0 = 0; + + assert(p_nb_pocs > 0); packet_array = (OPJ_UINT32*) opj_calloc(step_l * p_num_layers, sizeof(OPJ_UINT32)); @@ -1638,63 +1651,37 @@ static OPJ_BOOL opj_j2k_check_poc_val(const opj_poc_t *p_pocs, return OPJ_FALSE; } - if (p_nb_pocs == 0) { - opj_free(packet_array); - return OPJ_TRUE; - } + /* iterate through all the pocs that match our tile of interest. 
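
opj_j2k_check_poc_val flags every (layer, resolution, component) cell that a progression covers in one flat array, with index = layno * step_l + resno * step_r + compno * step_c. A worked example with tiny, made-up dimensions:

    #include <assert.h>

    int main(void)
    {
        /* example: 2 components, 3 resolutions, 2 layers */
        const unsigned step_c = 1;
        const unsigned step_r = 2 * step_c;  /* num_comps * step_c */
        const unsigned step_l = 3 * step_r;  /* num_res   * step_r */
        unsigned packet_array[2 * 6] = {0};  /* num_layers * step_l */

        /* mark the cell for layno=1, resno=2, compno=0 */
        packet_array[1 * step_l + 2 * step_r + 0 * step_c] = 1;
        assert(packet_array[10] == 1);       /* 6 + 4 + 0 */
        return 0;
    }
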
*/ + for (i = 0; i < p_nb_pocs; ++i) { + const opj_poc_t *poc = &p_pocs[i]; + if (tileno + 1 == poc->tile) { + index = step_r * poc->resno0; - index = step_r * p_pocs->resno0; - /* take each resolution for each poc */ - for (resno = p_pocs->resno0 ; resno < p_pocs->resno1 ; ++resno) { - OPJ_UINT32 res_index = index + p_pocs->compno0 * step_c; + /* take each resolution for each poc */ + for (resno = poc->resno0 ; + resno < opj_uint_min(poc->resno1, p_nb_resolutions); ++resno) { + OPJ_UINT32 res_index = index + poc->compno0 * step_c; - /* take each comp of each resolution for each poc */ - for (compno = p_pocs->compno0 ; compno < p_pocs->compno1 ; ++compno) { - OPJ_UINT32 comp_index = res_index + layno0 * step_l; + /* take each comp of each resolution for each poc */ + for (compno = poc->compno0 ; + compno < opj_uint_min(poc->compno1, p_num_comps); ++compno) { + /* The layer index always starts at zero for every progression. */ + const OPJ_UINT32 layno0 = 0; + OPJ_UINT32 comp_index = res_index + layno0 * step_l; - /* and finally take each layer of each res of ... */ - for (layno = layno0; layno < p_pocs->layno1 ; ++layno) { - /*index = step_r * resno + step_c * compno + step_l * layno;*/ - packet_array[comp_index] = 1; - comp_index += step_l; - } + /* and finally take each layer of each res of ... */ + for (layno = layno0; layno < opj_uint_min(poc->layno1, p_num_layers); + ++layno) { + packet_array[comp_index] = 1; + comp_index += step_l; + } - res_index += step_c; - } - - index += step_r; - } - ++p_pocs; - - /* iterate through all the pocs */ - for (i = 1; i < p_nb_pocs ; ++i) { - OPJ_UINT32 l_last_layno1 = (p_pocs - 1)->layno1 ; - - layno0 = (p_pocs->layno1 > l_last_layno1) ? l_last_layno1 : 0; - index = step_r * p_pocs->resno0; - - /* take each resolution for each poc */ - for (resno = p_pocs->resno0 ; resno < p_pocs->resno1 ; ++resno) { - OPJ_UINT32 res_index = index + p_pocs->compno0 * step_c; - - /* take each comp of each resolution for each poc */ - for (compno = p_pocs->compno0 ; compno < p_pocs->compno1 ; ++compno) { - OPJ_UINT32 comp_index = res_index + layno0 * step_l; - - /* and finally take each layer of each res of ... 
*/ - for (layno = layno0; layno < p_pocs->layno1 ; ++layno) { - /*index = step_r * resno + step_c * compno + step_l * layno;*/ - packet_array[comp_index] = 1; - comp_index += step_l; + res_index += step_c; } - res_index += step_c; + index += step_r; } - - index += step_r; } - - ++p_pocs; } index = 0; @@ -1702,7 +1689,13 @@ static OPJ_BOOL opj_j2k_check_poc_val(const opj_poc_t *p_pocs, for (resno = 0; resno < p_nb_resolutions; ++resno) { for (compno = 0; compno < p_num_comps; ++compno) { loss |= (packet_array[index] != 1); - /*index = step_r * resno + step_c * compno + step_l * layno;*/ +#ifdef DEBUG_VERBOSE + if (packet_array[index] != 1) { + fprintf(stderr, + "Missing packet in POC: layno=%d resno=%d compno=%d\n", + layno, resno, compno); + } +#endif index += step_c; } } @@ -2714,6 +2707,12 @@ static OPJ_BOOL opj_j2k_read_cod(opj_j2k_t *p_j2k, opj_read_bytes(p_header_data, &l_tcp->mct, 1); /* SGcod (C) */ ++p_header_data; + if (l_tcp->mct > 1) { + opj_event_msg(p_manager, EVT_ERROR, + "Invalid multiple component transformation\n"); + return OPJ_FALSE; + } + p_header_size -= 5; for (i = 0; i < l_image->numcomps; ++i) { l_tcp->tccps[i].csty = l_tcp->csty & J2K_CCP_CSTY_PRT; @@ -3452,6 +3451,28 @@ static OPJ_UINT32 opj_j2k_get_specific_header_sizes(opj_j2k_t *p_j2k) l_nb_bytes += opj_j2k_get_max_poc_size(p_j2k); + if (p_j2k->m_specific_param.m_encoder.m_PLT) { + /* Reserve space for PLT markers */ + + OPJ_UINT32 i; + const opj_cp_t * l_cp = &(p_j2k->m_cp); + OPJ_UINT32 l_max_packet_count = 0; + for (i = 0; i < l_cp->th * l_cp->tw; ++i) { + l_max_packet_count = opj_uint_max(l_max_packet_count, + opj_get_encoding_packet_count(p_j2k->m_private_image, l_cp, i)); + } + /* Minimum 6 bytes per PLT marker, and at a minimum (taking a pessimistic */ + /* estimate of 4 bytes for a packet size), one can write */ + /* (65536-6) / 4 = 16382 paquet sizes per PLT marker */ + p_j2k->m_specific_param.m_encoder.m_reserved_bytes_for_PLT = + 6 * opj_uint_ceildiv(l_max_packet_count, 16382); + /* Maximum 5 bytes per packet to encode a full UINT32 */ + p_j2k->m_specific_param.m_encoder.m_reserved_bytes_for_PLT += + l_nb_bytes += 5 * l_max_packet_count; + p_j2k->m_specific_param.m_encoder.m_reserved_bytes_for_PLT += 1; + l_nb_bytes += p_j2k->m_specific_param.m_encoder.m_reserved_bytes_for_PLT; + } + /*** DEVELOPER CORNER, Add room for your headers ***/ return l_nb_bytes; @@ -4205,7 +4226,7 @@ static OPJ_BOOL opj_j2k_write_tlm(opj_j2k_t *p_j2k, static OPJ_BOOL opj_j2k_write_sot(opj_j2k_t *p_j2k, OPJ_BYTE * p_data, - OPJ_UINT32 p_total_data_size, + OPJ_UINT32 total_data_size, OPJ_UINT32 * p_data_written, const opj_stream_private_t *p_stream, opj_event_mgr_t * p_manager @@ -4218,7 +4239,7 @@ static OPJ_BOOL opj_j2k_write_sot(opj_j2k_t *p_j2k, OPJ_UNUSED(p_stream); - if (p_total_data_size < 12) { + if (total_data_size < 12) { opj_event_msg(p_manager, EVT_ERROR, "Not enough bytes in output buffer to write SOT marker\n"); return OPJ_FALSE; @@ -4611,17 +4632,105 @@ static OPJ_BOOL opj_j2k_read_sot(opj_j2k_t *p_j2k, return OPJ_TRUE; } +/** + * Write one or more PLT markers in the provided buffer + */ +static OPJ_BOOL opj_j2k_write_plt_in_memory(opj_j2k_t *p_j2k, + opj_tcd_marker_info_t* marker_info, + OPJ_BYTE * p_data, + OPJ_UINT32 * p_data_written, + opj_event_mgr_t * p_manager) +{ + OPJ_BYTE Zplt = 0; + OPJ_UINT16 Lplt; + OPJ_BYTE* p_data_start = p_data; + OPJ_BYTE* p_data_Lplt = p_data + 2; + OPJ_UINT32 i; + + OPJ_UNUSED(p_j2k); + + opj_write_bytes(p_data, J2K_MS_PLT, 2); + p_data += 2; + + /* Reserve space for Lplt */ 
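
The loop that follows emits each packet length in the Iplt form: seven payload bits per byte, continuation bit 0x80 on every byte except the last, most-significant group first. The same encoding as a standalone sketch:

    #include <stdio.h>

    /* Illustration of the Iplt encoding used below: returns the number
       of bytes written to out (at most 5 for a 32-bit value). */
    static unsigned encode_iplt(unsigned packet_size, unsigned char out[5])
    {
        unsigned char tmp[5];
        unsigned n = 0, i;

        tmp[n++] = (unsigned char)(packet_size & 0x7f);
        packet_size >>= 7;
        while (packet_size > 0) {
            tmp[n++] = (unsigned char)((packet_size & 0x7f) | 0x80);
            packet_size >>= 7;
        }
        for (i = 0; i < n; i++) {       /* reverse: MSB group first */
            out[i] = tmp[n - 1 - i];
        }
        return n;
    }

    int main(void)
    {
        unsigned char buf[5];
        unsigned n = encode_iplt(300, buf);
        printf("%u bytes: %02X %02X\n", n,
               (unsigned)buf[0], (unsigned)buf[1]);
        /* prints: 2 bytes: 82 2C */
        return 0;
    }
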
+ p_data += 2; + + opj_write_bytes(p_data, Zplt, 1); + p_data += 1; + + Lplt = 3; + + for (i = 0; i < marker_info->packet_count; i++) { + OPJ_BYTE var_bytes[5]; + OPJ_UINT8 var_bytes_size = 0; + OPJ_UINT32 packet_size = marker_info->p_packet_size[i]; + + /* Packet size written in variable-length way, starting with LSB */ + var_bytes[var_bytes_size] = (OPJ_BYTE)(packet_size & 0x7f); + var_bytes_size ++; + packet_size >>= 7; + while (packet_size > 0) { + var_bytes[var_bytes_size] = (OPJ_BYTE)((packet_size & 0x7f) | 0x80); + var_bytes_size ++; + packet_size >>= 7; + } + + /* Check if that can fit in the current PLT marker. If not, finish */ + /* current one, and start a new one */ + if (Lplt + var_bytes_size > 65535) { + if (Zplt == 255) { + opj_event_msg(p_manager, EVT_ERROR, + "More than 255 PLT markers would be needed for current tile-part !\n"); + return OPJ_FALSE; + } + + /* Patch Lplt */ + opj_write_bytes(p_data_Lplt, Lplt, 2); + + /* Start new segment */ + opj_write_bytes(p_data, J2K_MS_PLT, 2); + p_data += 2; + + /* Reserve space for Lplt */ + p_data_Lplt = p_data; + p_data += 2; + + Zplt ++; + opj_write_bytes(p_data, Zplt, 1); + p_data += 1; + + Lplt = 3; + } + + Lplt = (OPJ_UINT16)(Lplt + var_bytes_size); + + /* Serialize variable-length packet size, starting with MSB */ + for (; var_bytes_size > 0; --var_bytes_size) { + opj_write_bytes(p_data, var_bytes[var_bytes_size - 1], 1); + p_data += 1; + } + } + + *p_data_written = (OPJ_UINT32)(p_data - p_data_start); + + /* Patch Lplt */ + opj_write_bytes(p_data_Lplt, Lplt, 2); + + return OPJ_TRUE; +} + static OPJ_BOOL opj_j2k_write_sod(opj_j2k_t *p_j2k, opj_tcd_t * p_tile_coder, OPJ_BYTE * p_data, OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, + OPJ_UINT32 total_data_size, const opj_stream_private_t *p_stream, opj_event_mgr_t * p_manager ) { opj_codestream_info_t *l_cstr_info = 00; OPJ_UINT32 l_remaining_data; + opj_tcd_marker_info_t* marker_info = NULL; /* preconditions */ assert(p_j2k != 00); @@ -4630,7 +4739,7 @@ static OPJ_BOOL opj_j2k_write_sod(opj_j2k_t *p_j2k, OPJ_UNUSED(p_stream); - if (p_total_data_size < 4) { + if (total_data_size < 4) { opj_event_msg(p_manager, EVT_ERROR, "Not enough bytes in output buffer to write SOD marker\n"); return OPJ_FALSE; @@ -4638,10 +4747,9 @@ static OPJ_BOOL opj_j2k_write_sod(opj_j2k_t *p_j2k, opj_write_bytes(p_data, J2K_MS_SOD, 2); /* SOD */ - p_data += 2; /* make room for the EOF marker */ - l_remaining_data = p_total_data_size - 4; + l_remaining_data = total_data_size - 4; /* update tile coder */ p_tile_coder->tp_num = @@ -4688,15 +4796,69 @@ static OPJ_BOOL opj_j2k_write_sod(opj_j2k_t *p_j2k, *p_data_written = 0; - if (! opj_tcd_encode_tile(p_tile_coder, p_j2k->m_current_tile_number, p_data, + if (p_j2k->m_specific_param.m_encoder.m_PLT) { + marker_info = opj_tcd_marker_info_create( + p_j2k->m_specific_param.m_encoder.m_PLT); + if (marker_info == NULL) { + opj_event_msg(p_manager, EVT_ERROR, + "Cannot encode tile: opj_tcd_marker_info_create() failed\n"); + return OPJ_FALSE; + } + } + + if (l_remaining_data < + p_j2k->m_specific_param.m_encoder.m_reserved_bytes_for_PLT) { + opj_event_msg(p_manager, EVT_ERROR, + "Not enough bytes in output buffer to write SOD marker\n"); + opj_tcd_marker_info_destroy(marker_info); + return OPJ_FALSE; + } + l_remaining_data -= p_j2k->m_specific_param.m_encoder.m_reserved_bytes_for_PLT; + + if (! 
opj_tcd_encode_tile(p_tile_coder, p_j2k->m_current_tile_number, + p_data + 2, p_data_written, l_remaining_data, l_cstr_info, + marker_info, p_manager)) { opj_event_msg(p_manager, EVT_ERROR, "Cannot encode tile\n"); + opj_tcd_marker_info_destroy(marker_info); return OPJ_FALSE; } + /* For SOD */ *p_data_written += 2; + if (p_j2k->m_specific_param.m_encoder.m_PLT) { + OPJ_UINT32 l_data_written_PLT = 0; + OPJ_BYTE* p_PLT_buffer = (OPJ_BYTE*)opj_malloc( + p_j2k->m_specific_param.m_encoder.m_reserved_bytes_for_PLT); + if (!p_PLT_buffer) { + opj_event_msg(p_manager, EVT_ERROR, "Cannot allocate memory\n"); + opj_tcd_marker_info_destroy(marker_info); + return OPJ_FALSE; + } + if (!opj_j2k_write_plt_in_memory(p_j2k, + marker_info, + p_PLT_buffer, + &l_data_written_PLT, + p_manager)) { + opj_tcd_marker_info_destroy(marker_info); + opj_free(p_PLT_buffer); + return OPJ_FALSE; + } + + assert(l_data_written_PLT <= + p_j2k->m_specific_param.m_encoder.m_reserved_bytes_for_PLT); + + /* Move PLT marker(s) before SOD */ + memmove(p_data + l_data_written_PLT, p_data, *p_data_written); + memcpy(p_data, p_PLT_buffer, l_data_written_PLT); + opj_free(p_PLT_buffer); + *p_data_written += l_data_written_PLT; + } + + opj_tcd_marker_info_destroy(marker_info); + return OPJ_TRUE; } @@ -5046,7 +5208,7 @@ static OPJ_BOOL opj_j2k_update_rates(opj_j2k_t *p_j2k, OPJ_FLOAT32 * l_rates = 0; OPJ_FLOAT32 l_sot_remove; OPJ_UINT32 l_bits_empty, l_size_pixel; - OPJ_UINT32 l_tile_size = 0; + OPJ_UINT64 l_tile_size = 0; OPJ_UINT32 l_last_res; OPJ_FLOAT32(* l_tp_stride_func)(opj_tcp_t *) = 00; @@ -5090,25 +5252,12 @@ static OPJ_BOOL opj_j2k_update_rates(opj_j2k_t *p_j2k, l_rates = l_tcp->rates; /* Modification of the RATE >> */ - if (*l_rates > 0.0f) { - *l_rates = (((OPJ_FLOAT32)(l_size_pixel * (OPJ_UINT32)(l_x1 - l_x0) * - (OPJ_UINT32)(l_y1 - l_y0))) - / - ((*l_rates) * (OPJ_FLOAT32)l_bits_empty) - ) - - - l_offset; - } - - ++l_rates; - - for (k = 1; k < l_tcp->numlayers; ++k) { + for (k = 0; k < l_tcp->numlayers; ++k) { if (*l_rates > 0.0f) { - *l_rates = (((OPJ_FLOAT32)(l_size_pixel * (OPJ_UINT32)(l_x1 - l_x0) * - (OPJ_UINT32)(l_y1 - l_y0))) - / - ((*l_rates) * (OPJ_FLOAT32)l_bits_empty) - ) + *l_rates = (OPJ_FLOAT32)(((OPJ_FLOAT64)l_size_pixel * (OPJ_UINT32)( + l_x1 - l_x0) * + (OPJ_UINT32)(l_y1 - l_y0)) + / ((*l_rates) * (OPJ_FLOAT32)l_bits_empty)) - l_offset; } @@ -5168,12 +5317,11 @@ static OPJ_BOOL opj_j2k_update_rates(opj_j2k_t *p_j2k, l_tile_size = 0; for (i = 0; i < l_image->numcomps; ++i) { - l_tile_size += (opj_uint_ceildiv(l_cp->tdx, l_img_comp->dx) - * - opj_uint_ceildiv(l_cp->tdy, l_img_comp->dy) - * - l_img_comp->prec - ); + l_tile_size += (OPJ_UINT64)opj_uint_ceildiv(l_cp->tdx, l_img_comp->dx) + * + opj_uint_ceildiv(l_cp->tdy, l_img_comp->dy) + * + l_img_comp->prec; ++l_img_comp; } @@ -5184,7 +5332,7 @@ static OPJ_BOOL opj_j2k_update_rates(opj_j2k_t *p_j2k, /* bin/test_tile_encoder 1 256 256 32 32 8 0 reversible_with_precinct.j2k 4 4 3 0 0 1 16 16 */ /* TODO revise this to take into account the overhead linked to the */ /* number of packets and number of code blocks in packets */ - l_tile_size = (OPJ_UINT32)(l_tile_size * 1.4 / 8); + l_tile_size = (OPJ_UINT64)((double)l_tile_size * 1.4 / 8); /* Arbitrary amount to make the following work: */ /* bin/test_tile_encoder 1 256 256 17 16 8 0 reversible_no_precinct.j2k 4 4 3 0 0 1 */ @@ -5192,14 +5340,21 @@ static OPJ_BOOL opj_j2k_update_rates(opj_j2k_t *p_j2k, l_tile_size += opj_j2k_get_specific_header_sizes(p_j2k); - p_j2k->m_specific_param.m_encoder.m_encoded_tile_size = 
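
Since the tile body is encoded before the PLT size is known, the markers are built in a side buffer and spliced in front of the tile data with a shift-then-copy, as above. The idiom in isolation (sketch; names are hypothetical and the caller guarantees capacity):

    #include <string.h>

    /* Insert hdr_len bytes in front of payload_len bytes already at buf;
       buf must have room for payload_len + hdr_len. */
    static size_t prepend(unsigned char* buf, size_t payload_len,
                          const unsigned char* hdr, size_t hdr_len)
    {
        memmove(buf + hdr_len, buf, payload_len); /* shift payload right */
        memcpy(buf, hdr, hdr_len);                /* header into the gap */
        return payload_len + hdr_len;
    }
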
l_tile_size; + if (l_tile_size > UINT_MAX) { + l_tile_size = UINT_MAX; + } + + p_j2k->m_specific_param.m_encoder.m_encoded_tile_size = (OPJ_UINT32)l_tile_size; p_j2k->m_specific_param.m_encoder.m_encoded_tile_data = (OPJ_BYTE *) opj_malloc(p_j2k->m_specific_param.m_encoder.m_encoded_tile_size); if (p_j2k->m_specific_param.m_encoder.m_encoded_tile_data == 00) { + opj_event_msg(p_manager, EVT_ERROR, + "Not enough memory to allocate m_encoded_tile_data. %u MB required\n", + (OPJ_UINT32)(l_tile_size / 1024 / 1024)); return OPJ_FALSE; } - if (OPJ_IS_CINEMA(l_cp->rsiz)) { + if (OPJ_IS_CINEMA(l_cp->rsiz) || OPJ_IS_IMF(l_cp->rsiz)) { p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer = (OPJ_BYTE *) opj_malloc(5 * p_j2k->m_specific_param.m_encoder.m_total_tile_parts); @@ -6627,7 +6782,7 @@ static void opj_j2k_set_cinema_parameters(opj_cparameters_t *parameters, } /* Precincts */ - parameters->csty |= 0x01; + parameters->csty |= J2K_CP_CSTY_PRT; if (parameters->numresolution == 1) { parameters->res_spec = 1; parameters->prcw_init[0] = 128; @@ -6753,6 +6908,589 @@ static OPJ_BOOL opj_j2k_is_cinema_compliant(opj_image_t *image, OPJ_UINT16 rsiz, return OPJ_TRUE; } +static int opj_j2k_get_imf_max_NL(opj_cparameters_t *parameters, + opj_image_t *image) +{ + /* Decomposition levels */ + const OPJ_UINT16 rsiz = parameters->rsiz; + const OPJ_UINT16 profile = OPJ_GET_IMF_PROFILE(rsiz); + const OPJ_UINT32 XTsiz = parameters->tile_size_on ? (OPJ_UINT32) + parameters->cp_tdx : image->x1; + switch (profile) { + case OPJ_PROFILE_IMF_2K: + return 5; + case OPJ_PROFILE_IMF_4K: + return 6; + case OPJ_PROFILE_IMF_8K: + return 7; + case OPJ_PROFILE_IMF_2K_R: { + if (XTsiz >= 2048) { + return 5; + } else if (XTsiz >= 1024) { + return 4; + } + break; + } + case OPJ_PROFILE_IMF_4K_R: { + if (XTsiz >= 4096) { + return 6; + } else if (XTsiz >= 2048) { + return 5; + } else if (XTsiz >= 1024) { + return 4; + } + break; + } + case OPJ_PROFILE_IMF_8K_R: { + if (XTsiz >= 8192) { + return 7; + } else if (XTsiz >= 4096) { + return 6; + } else if (XTsiz >= 2048) { + return 5; + } else if (XTsiz >= 1024) { + return 4; + } + break; + } + default: + break; + } + return -1; +} + +static void opj_j2k_set_imf_parameters(opj_cparameters_t *parameters, + opj_image_t *image, opj_event_mgr_t *p_manager) +{ + const OPJ_UINT16 rsiz = parameters->rsiz; + const OPJ_UINT16 profile = OPJ_GET_IMF_PROFILE(rsiz); + + OPJ_UNUSED(p_manager); + + /* Override defaults set by opj_set_default_encoder_parameters */ + if (parameters->cblockw_init == OPJ_COMP_PARAM_DEFAULT_CBLOCKW && + parameters->cblockh_init == OPJ_COMP_PARAM_DEFAULT_CBLOCKH) { + parameters->cblockw_init = 32; + parameters->cblockh_init = 32; + } + + /* One tile part for each component */ + parameters->tp_flag = 'C'; + parameters->tp_on = 1; + + if (parameters->prog_order == OPJ_COMP_PARAM_DEFAULT_PROG_ORDER) { + parameters->prog_order = OPJ_CPRL; + } + + if (profile == OPJ_PROFILE_IMF_2K || + profile == OPJ_PROFILE_IMF_4K || + profile == OPJ_PROFILE_IMF_8K) { + /* 9-7 transform */ + parameters->irreversible = 1; + } + + /* Adjust the number of resolutions if set to its defaults */ + if (parameters->numresolution == OPJ_COMP_PARAM_DEFAULT_NUMRESOLUTION && + image->x0 == 0 && + image->y0 == 0) { + const int max_NL = opj_j2k_get_imf_max_NL(parameters, image); + if (max_NL >= 0 && parameters->numresolution > max_NL) { + parameters->numresolution = max_NL + 1; + } + + /* Note: below is generic logic */ + if (!parameters->tile_size_on) { + while (parameters->numresolution > 0) { + if 
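
opj_j2k_get_imf_max_NL above caps the decomposition count per profile; for the reduced-resolution (_R) profiles the cap follows the tile width. The 2K_R rule in isolation (sketch of the same thresholds):

    /* Max decomposition levels for IMF 2K_R as a function of XTsiz
       (the tile width, falling back to image width when tiling is off). */
    static int imf_2k_r_max_nl(unsigned XTsiz)
    {
        if (XTsiz >= 2048) {
            return 5;
        }
        if (XTsiz >= 1024) {
            return 4;
        }
        return -1;                  /* tile too small for the profile */
    }
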
(image->x1 < (1U << ((OPJ_UINT32)parameters->numresolution - 1U))) { + parameters->numresolution --; + continue; + } + if (image->y1 < (1U << ((OPJ_UINT32)parameters->numresolution - 1U))) { + parameters->numresolution --; + continue; + } + break; + } + } + } + + /* Set defaults precincts */ + if (parameters->csty == 0) { + parameters->csty |= J2K_CP_CSTY_PRT; + if (parameters->numresolution == 1) { + parameters->res_spec = 1; + parameters->prcw_init[0] = 128; + parameters->prch_init[0] = 128; + } else { + int i; + parameters->res_spec = parameters->numresolution - 1; + for (i = 0; i < parameters->res_spec; i++) { + parameters->prcw_init[i] = 256; + parameters->prch_init[i] = 256; + } + } + } +} + +/* Table A.53 from JPEG2000 standard */ +static const OPJ_UINT16 tabMaxSubLevelFromMainLevel[] = { + 15, /* unspecified */ + 1, + 1, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9 +}; + +static OPJ_BOOL opj_j2k_is_imf_compliant(opj_cparameters_t *parameters, + opj_image_t *image, + opj_event_mgr_t *p_manager) +{ + OPJ_UINT32 i; + const OPJ_UINT16 rsiz = parameters->rsiz; + const OPJ_UINT16 profile = OPJ_GET_IMF_PROFILE(rsiz); + const OPJ_UINT16 mainlevel = OPJ_GET_IMF_MAINLEVEL(rsiz); + const OPJ_UINT16 sublevel = OPJ_GET_IMF_SUBLEVEL(rsiz); + const int NL = parameters->numresolution - 1; + const OPJ_UINT32 XTsiz = parameters->tile_size_on ? (OPJ_UINT32) + parameters->cp_tdx : image->x1; + OPJ_BOOL ret = OPJ_TRUE; + + /* Validate mainlevel */ + if (mainlevel > OPJ_IMF_MAINLEVEL_MAX) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profile require mainlevel <= 11.\n" + "-> %d is thus not compliant\n" + "-> Non-IMF codestream will be generated\n", + mainlevel); + ret = OPJ_FALSE; + } + + /* Validate sublevel */ + assert(sizeof(tabMaxSubLevelFromMainLevel) == + (OPJ_IMF_MAINLEVEL_MAX + 1) * sizeof(tabMaxSubLevelFromMainLevel[0])); + if (sublevel > tabMaxSubLevelFromMainLevel[mainlevel]) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profile require sublevel <= %d for mainlevel = %d.\n" + "-> %d is thus not compliant\n" + "-> Non-IMF codestream will be generated\n", + tabMaxSubLevelFromMainLevel[mainlevel], + mainlevel, + sublevel); + ret = OPJ_FALSE; + } + + /* Number of components */ + if (image->numcomps > 3) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require at most 3 components.\n" + "-> Number of components of input image (%d) is not compliant\n" + "-> Non-IMF codestream will be generated\n", + image->numcomps); + ret = OPJ_FALSE; + } + + if (image->x0 != 0 || image->y0 != 0) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require image origin to be at 0,0.\n" + "-> %d,%d is not compliant\n" + "-> Non-IMF codestream will be generated\n", + image->x0, image->y0 != 0); + ret = OPJ_FALSE; + } + + if (parameters->cp_tx0 != 0 || parameters->cp_ty0 != 0) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require tile origin to be at 0,0.\n" + "-> %d,%d is not compliant\n" + "-> Non-IMF codestream will be generated\n", + parameters->cp_tx0, parameters->cp_ty0); + ret = OPJ_FALSE; + } + + if (parameters->tile_size_on) { + if (profile == OPJ_PROFILE_IMF_2K || + profile == OPJ_PROFILE_IMF_4K || + profile == OPJ_PROFILE_IMF_8K) { + if ((OPJ_UINT32)parameters->cp_tdx < image->x1 || + (OPJ_UINT32)parameters->cp_tdy < image->y1) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 2K/4K/8K single tile profiles require tile to be greater or equal to image size.\n" + "-> %d,%d is lesser than %d,%d\n" + "-> Non-IMF codestream will be generated\n", + parameters->cp_tdx, + 
parameters->cp_tdy, + image->x1, + image->y1); + ret = OPJ_FALSE; + } + } else { + if ((OPJ_UINT32)parameters->cp_tdx >= image->x1 && + (OPJ_UINT32)parameters->cp_tdy >= image->y1) { + /* ok */ + } else if (parameters->cp_tdx == 1024 && + parameters->cp_tdy == 1024) { + /* ok */ + } else if (parameters->cp_tdx == 2048 && + parameters->cp_tdy == 2048 && + (profile == OPJ_PROFILE_IMF_4K || + profile == OPJ_PROFILE_IMF_8K)) { + /* ok */ + } else if (parameters->cp_tdx == 4096 && + parameters->cp_tdy == 4096 && + profile == OPJ_PROFILE_IMF_8K) { + /* ok */ + } else { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 2K_R/4K_R/8K_R single/multiple tile profiles " + "require tile to be greater or equal to image size,\n" + "or to be (1024,1024), or (2048,2048) for 4K_R/8K_R " + "or (4096,4096) for 8K_R.\n" + "-> %d,%d is non conformant\n" + "-> Non-IMF codestream will be generated\n", + parameters->cp_tdx, + parameters->cp_tdy); + ret = OPJ_FALSE; + } + } + } + + /* Bitdepth */ + for (i = 0; i < image->numcomps; i++) { + if (!(image->comps[i].bpp >= 8 && image->comps[i].bpp <= 16) || + (image->comps[i].sgnd)) { + char signed_str[] = "signed"; + char unsigned_str[] = "unsigned"; + char *tmp_str = image->comps[i].sgnd ? signed_str : unsigned_str; + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require precision of each component to b in [8-16] bits unsigned" + "-> At least component %d of input image (%d bits, %s) is not compliant\n" + "-> Non-IMF codestream will be generated\n", + i, image->comps[i].bpp, tmp_str); + ret = OPJ_FALSE; + } + } + + /* Sub-sampling */ + for (i = 0; i < image->numcomps; i++) { + if (i == 0 && image->comps[i].dx != 1) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require XRSiz1 == 1. Here it is set to %d.\n" + "-> Non-IMF codestream will be generated\n", + image->comps[i].dx); + ret = OPJ_FALSE; + } + if (i == 1 && image->comps[i].dx != 1 && image->comps[i].dx != 2) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require XRSiz2 == 1 or 2. Here it is set to %d.\n" + "-> Non-IMF codestream will be generated\n", + image->comps[i].dx); + ret = OPJ_FALSE; + } + if (i > 1 && image->comps[i].dx != image->comps[i - 1].dx) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require XRSiz%d to be the same as XRSiz2. " + "Here it is set to %d instead of %d.\n" + "-> Non-IMF codestream will be generated\n", + i + 1, image->comps[i].dx, image->comps[i - 1].dx); + ret = OPJ_FALSE; + } + if (image->comps[i].dy != 1) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require YRsiz == 1. 
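
The branchy acceptance test above boils down to: a tile must cover the whole image, or be one of a short list of square sizes that widens with the profile. As a compact predicate (sketch; the enum is a stand-in for the OPJ_PROFILE_IMF_* constants):

    typedef enum { IMF_2K_R, IMF_4K_R, IMF_8K_R } imf_r_profile;

    /* Accepted tile geometry for the reduced-resolution IMF profiles. */
    static int imf_r_tile_ok(imf_r_profile p, unsigned tdx, unsigned tdy,
                             unsigned img_w, unsigned img_h)
    {
        if (tdx >= img_w && tdy >= img_h) {
            return 1;               /* single tile covering the image */
        }
        if (tdx == 1024 && tdy == 1024) {
            return 1;
        }
        if (tdx == 2048 && tdy == 2048 && p != IMF_2K_R) {
            return 1;
        }
        if (tdx == 4096 && tdy == 4096 && p == IMF_8K_R) {
            return 1;
        }
        return 0;
    }
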
" + "Here it is set to %d for component i.\n" + "-> Non-IMF codestream will be generated\n", + image->comps[i].dy, i); + ret = OPJ_FALSE; + } + } + + /* Image size */ + switch (profile) { + case OPJ_PROFILE_IMF_2K: + case OPJ_PROFILE_IMF_2K_R: + if (((image->comps[0].w > 2048) | (image->comps[0].h > 1556))) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 2K/2K_R profile require:\n" + "width <= 2048 and height <= 1556\n" + "-> Input image size %d x %d is not compliant\n" + "-> Non-IMF codestream will be generated\n", + image->comps[0].w, image->comps[0].h); + ret = OPJ_FALSE; + } + break; + case OPJ_PROFILE_IMF_4K: + case OPJ_PROFILE_IMF_4K_R: + if (((image->comps[0].w > 4096) | (image->comps[0].h > 3112))) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 4K/4K_R profile require:\n" + "width <= 4096 and height <= 3112\n" + "-> Input image size %d x %d is not compliant\n" + "-> Non-IMF codestream will be generated\n", + image->comps[0].w, image->comps[0].h); + ret = OPJ_FALSE; + } + break; + case OPJ_PROFILE_IMF_8K: + case OPJ_PROFILE_IMF_8K_R: + if (((image->comps[0].w > 8192) | (image->comps[0].h > 6224))) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 8K/8K_R profile require:\n" + "width <= 8192 and height <= 6224\n" + "-> Input image size %d x %d is not compliant\n" + "-> Non-IMF codestream will be generated\n", + image->comps[0].w, image->comps[0].h); + ret = OPJ_FALSE; + } + break; + default : + assert(0); + return OPJ_FALSE; + } + + if (parameters->roi_compno != -1) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profile forbid RGN / region of interest marker.\n" + "-> Compression parameters specify a ROI\n" + "-> Non-IMF codestream will be generated\n"); + ret = OPJ_FALSE; + } + + if (parameters->cblockw_init != 32 || parameters->cblockh_init != 32) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profile require code block size to be 32x32.\n" + "-> Compression parameters set it to %dx%d.\n" + "-> Non-IMF codestream will be generated\n", + parameters->cblockw_init, + parameters->cblockh_init); + ret = OPJ_FALSE; + } + + if (parameters->prog_order != OPJ_CPRL) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profile require progression order to be CPRL.\n" + "-> Compression parameters set it to %d.\n" + "-> Non-IMF codestream will be generated\n", + parameters->prog_order); + ret = OPJ_FALSE; + } + + if (parameters->numpocs != 0) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profile forbid POC markers.\n" + "-> Compression parameters set %d POC.\n" + "-> Non-IMF codestream will be generated\n", + parameters->numpocs); + ret = OPJ_FALSE; + } + + /* Codeblock style: no mode switch enabled */ + if (parameters->mode != 0) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profile forbid mode switch in code block style.\n" + "-> Compression parameters set code block style to %d.\n" + "-> Non-IMF codestream will be generated\n", + parameters->mode); + ret = OPJ_FALSE; + } + + if (profile == OPJ_PROFILE_IMF_2K || + profile == OPJ_PROFILE_IMF_4K || + profile == OPJ_PROFILE_IMF_8K) { + /* Expect 9-7 transform */ + if (parameters->irreversible != 1) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 2K/4K/8K profiles require 9-7 Irreversible Transform.\n" + "-> Compression parameters set it to reversible.\n" + "-> Non-IMF codestream will be generated\n"); + ret = OPJ_FALSE; + } + } else { + /* Expect 5-3 transform */ + if (parameters->irreversible != 0) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 2K/4K/8K profiles require 5-3 reversible Transform.\n" + "-> Compression parameters set it 
to irreversible.\n" + "-> Non-IMF codestream will be generated\n"); + ret = OPJ_FALSE; + } + } + + /* Number of layers */ + if (parameters->tcp_numlayers != 1) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 2K/4K/8K profiles require 1 single quality layer.\n" + "-> Number of layers is %d.\n" + "-> Non-IMF codestream will be generated\n", + parameters->tcp_numlayers); + ret = OPJ_FALSE; + } + + /* Decomposition levels */ + switch (profile) { + case OPJ_PROFILE_IMF_2K: + if (!(NL >= 1 && NL <= 5)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 2K profile requires 1 <= NL <= 5:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + break; + case OPJ_PROFILE_IMF_4K: + if (!(NL >= 1 && NL <= 6)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 4K profile requires 1 <= NL <= 6:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + break; + case OPJ_PROFILE_IMF_8K: + if (!(NL >= 1 && NL <= 7)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 8K profile requires 1 <= NL <= 7:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + break; + case OPJ_PROFILE_IMF_2K_R: { + if (XTsiz >= 2048) { + if (!(NL >= 1 && NL <= 5)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 2K_R profile requires 1 <= NL <= 5 for XTsiz >= 2048:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } else if (XTsiz >= 1024) { + if (!(NL >= 1 && NL <= 4)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 2K_R profile requires 1 <= NL <= 4 for XTsiz in [1024,2048[:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } + break; + } + case OPJ_PROFILE_IMF_4K_R: { + if (XTsiz >= 4096) { + if (!(NL >= 1 && NL <= 6)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 4K_R profile requires 1 <= NL <= 6 for XTsiz >= 4096:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } else if (XTsiz >= 2048) { + if (!(NL >= 1 && NL <= 5)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 4K_R profile requires 1 <= NL <= 5 for XTsiz in [2048,4096[:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } else if (XTsiz >= 1024) { + if (!(NL >= 1 && NL <= 4)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 4K_R profile requires 1 <= NL <= 4 for XTsiz in [1024,2048[:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } + break; + } + case OPJ_PROFILE_IMF_8K_R: { + if (XTsiz >= 8192) { + if (!(NL >= 1 && NL <= 7)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 4K_R profile requires 1 <= NL <= 7 for XTsiz >= 8192:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } else if (XTsiz >= 4096) { + if (!(NL >= 1 && NL <= 6)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 4K_R profile requires 1 <= NL <= 6 for XTsiz in [4096,8192[:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } else if (XTsiz >= 2048) { + if (!(NL >= 1 && NL <= 5)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 4K_R profile 
requires 1 <= NL <= 5 for XTsiz in [2048,4096[:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } else if (XTsiz >= 1024) { + if (!(NL >= 1 && NL <= 4)) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF 4K_R profile requires 1 <= NL <= 4 for XTsiz in [1024,2048[:\n" + "-> Number of decomposition levels is %d.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } + break; + } + default: + break; + } + + if (parameters->numresolution == 1) { + if (parameters->res_spec != 1 || + parameters->prcw_init[0] != 128 || + parameters->prch_init[0] != 128) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require PPx = PPy = 7 for NLLL band, else 8.\n" + "-> Supplied values are different from that.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } else { + int i; + for (i = 0; i < parameters->res_spec; i++) { + if (parameters->prcw_init[i] != 256 || + parameters->prch_init[i] != 256) { + opj_event_msg(p_manager, EVT_WARNING, + "IMF profiles require PPx = PPy = 7 for NLLL band, else 8.\n" + "-> Supplied values are different from that.\n" + "-> Non-IMF codestream will be generated\n", + NL); + ret = OPJ_FALSE; + } + } + } + + return ret; +} + + OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, opj_cparameters_t *parameters, opj_image_t *image, @@ -6945,6 +7683,15 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, } else { OPJ_FLOAT32 temp_rate; OPJ_BOOL cap = OPJ_FALSE; + + if (OPJ_IS_IMF(parameters->rsiz) && parameters->max_cs_size > 0 && + parameters->tcp_numlayers == 1 && parameters->tcp_rates[0] == 0) { + parameters->tcp_rates[0] = (OPJ_FLOAT32)(image->numcomps * image->comps[0].w * + image->comps[0].h * image->comps[0].prec) / + (OPJ_FLOAT32)(((OPJ_UINT32)parameters->max_cs_size) * 8 * image->comps[0].dx * + image->comps[0].dy); + } + temp_rate = (OPJ_FLOAT32)(((double)image->numcomps * image->comps[0].w * image->comps[0].h * image->comps[0].prec) / (((double)parameters->max_cs_size) * 8 * image->comps[0].dx * @@ -6985,9 +7732,10 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, "JPEG 2000 Broadcast profiles not yet supported\n"); parameters->rsiz = OPJ_PROFILE_NONE; } else if (OPJ_IS_IMF(parameters->rsiz)) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 IMF profiles not yet supported\n"); - parameters->rsiz = OPJ_PROFILE_NONE; + opj_j2k_set_imf_parameters(parameters, image, p_manager); + if (!opj_j2k_is_imf_compliant(parameters, image, p_manager)) { + parameters->rsiz = OPJ_PROFILE_NONE; + } } else if (OPJ_IS_PART2(parameters->rsiz)) { if (parameters->rsiz == ((OPJ_PROFILE_PART2) | (OPJ_EXTENSION_NONE))) { opj_event_msg(p_manager, EVT_WARNING, @@ -7079,6 +7827,14 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, */ if (parameters->tile_size_on) { + if (cp->tdx == 0) { + opj_event_msg(p_manager, EVT_ERROR, "Invalid tile width\n"); + return OPJ_FALSE; + } + if (cp->tdy == 0) { + opj_event_msg(p_manager, EVT_ERROR, "Invalid tile height\n"); + return OPJ_FALSE; + } cp->tw = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(image->x1 - cp->tx0), (OPJ_INT32)cp->tdx); cp->th = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(image->y1 - cp->ty0), @@ -7157,20 +7913,13 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, "Not enough memory to allocate tile coding parameters\n"); return OPJ_FALSE; } - if (parameters->numpocs) { - /* initialisation of POC */ - opj_j2k_check_poc_val(parameters->POC, parameters->numpocs, - (OPJ_UINT32)parameters->numresolution, 
image->numcomps, - (OPJ_UINT32)parameters->tcp_numlayers, p_manager); - /* TODO MSD use the return value*/ - } for (tileno = 0; tileno < cp->tw * cp->th; tileno++) { opj_tcp_t *tcp = &cp->tcps[tileno]; tcp->numlayers = (OPJ_UINT32)parameters->tcp_numlayers; for (j = 0; j < tcp->numlayers; j++) { - if (OPJ_IS_CINEMA(cp->rsiz)) { + if (OPJ_IS_CINEMA(cp->rsiz) || OPJ_IS_IMF(cp->rsiz)) { if (cp->m_specific_param.m_enc.m_fixed_quality) { tcp->distoratio[j] = parameters->tcp_distoratio[j]; } @@ -7197,16 +7946,22 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, if (parameters->numpocs) { /* initialisation of POC */ - tcp->POC = 1; for (i = 0; i < parameters->numpocs; i++) { if (tileno + 1 == parameters->POC[i].tile) { opj_poc_t *tcp_poc = &tcp->pocs[numpocs_tile]; + if (parameters->POC[numpocs_tile].compno0 >= image->numcomps) { + opj_event_msg(p_manager, EVT_ERROR, + "Invalid compno0 for POC %d\n", i); + return OPJ_FALSE; + } + tcp_poc->resno0 = parameters->POC[numpocs_tile].resno0; tcp_poc->compno0 = parameters->POC[numpocs_tile].compno0; tcp_poc->layno1 = parameters->POC[numpocs_tile].layno1; tcp_poc->resno1 = parameters->POC[numpocs_tile].resno1; - tcp_poc->compno1 = parameters->POC[numpocs_tile].compno1; + tcp_poc->compno1 = opj_uint_min(parameters->POC[numpocs_tile].compno1, + image->numcomps); tcp_poc->prg1 = parameters->POC[numpocs_tile].prg1; tcp_poc->tile = parameters->POC[numpocs_tile].tile; @@ -7214,7 +7969,16 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, } } - tcp->numpocs = numpocs_tile - 1 ; + if (numpocs_tile) { + + /* TODO MSD use the return value*/ + opj_j2k_check_poc_val(parameters->POC, tileno, parameters->numpocs, + (OPJ_UINT32)parameters->numresolution, image->numcomps, + (OPJ_UINT32)parameters->tcp_numlayers, p_manager); + + tcp->POC = 1; + tcp->numpocs = numpocs_tile - 1 ; + } } else { tcp->numpocs = 0; } @@ -7542,6 +8306,8 @@ OPJ_BOOL opj_j2k_read_header(opj_stream_private_t *p_stream, /*Allocate and initialize some elements of codestrem index*/ if (!opj_j2k_allocate_tile_element_cstr_index(p_j2k)) { + opj_image_destroy(*p_image); + *p_image = NULL; return OPJ_FALSE; } @@ -8628,6 +9394,7 @@ OPJ_BOOL opj_j2k_read_tile_header(opj_j2k_t * p_j2k, OPJ_UINT32 l_marker_size; const opj_dec_memory_marker_handler_t * l_marker_handler = 00; opj_tcp_t * l_tcp = NULL; + const OPJ_UINT32 l_nb_tiles = p_j2k->m_cp.tw * p_j2k->m_cp.th; /* preconditions */ assert(p_stream != 00); @@ -8803,7 +9570,6 @@ OPJ_BOOL opj_j2k_read_tile_header(opj_j2k_t * p_j2k, return OPJ_FALSE; } if (l_correction_needed) { - OPJ_UINT32 l_nb_tiles = p_j2k->m_cp.tw * p_j2k->m_cp.th; OPJ_UINT32 l_tile_no; p_j2k->m_specific_param.m_decoder.m_can_decode = 0; @@ -8818,27 +9584,42 @@ OPJ_BOOL opj_j2k_read_tile_header(opj_j2k_t * p_j2k, "Non conformant codestream TPsot==TNsot.\n"); } } - if (! p_j2k->m_specific_param.m_decoder.m_can_decode) { - /* Try to read 2 bytes (the next marker ID) from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* Read 2 bytes from buffer as the new marker ID */ - opj_read_bytes(p_j2k->m_specific_param.m_decoder.m_header_data, - &l_current_marker, 2); - } } else { /* Indicate we will try to read a new tile-part header*/ p_j2k->m_specific_param.m_decoder.m_skip_data = 0; p_j2k->m_specific_param.m_decoder.m_can_decode = 0; p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_TPHSOT; + } + if (! 
p_j2k->m_specific_param.m_decoder.m_can_decode) { /* Try to read 2 bytes (the next marker ID) from stream and copy them into the buffer */ if (opj_stream_read_data(p_stream, p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { + + /* Deal with likely non conformant SPOT6 files, where the last */ + /* row of tiles have TPsot == 0 and TNsot == 0, and missing EOC, */ + /* but no other tile-parts were found. */ + if (p_j2k->m_current_tile_number + 1 == l_nb_tiles) { + OPJ_UINT32 l_tile_no; + for (l_tile_no = 0U; l_tile_no < l_nb_tiles; ++l_tile_no) { + if (p_j2k->m_cp.tcps[l_tile_no].m_current_tile_part_number == 0 && + p_j2k->m_cp.tcps[l_tile_no].m_nb_tile_parts == 0) { + break; + } + } + if (l_tile_no < l_nb_tiles) { + opj_event_msg(p_manager, EVT_INFO, + "Tile %u has TPsot == 0 and TNsot == 0, " + "but no other tile-parts were found. " + "EOC is also missing.\n", + l_tile_no); + p_j2k->m_current_tile_number = l_tile_no; + l_current_marker = J2K_MS_EOC; + p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_EOC; + break; + } + } + opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); return OPJ_FALSE; } @@ -8857,9 +9638,8 @@ OPJ_BOOL opj_j2k_read_tile_header(opj_j2k_t * p_j2k, } } - /* FIXME DOC ???*/ + /* Deal with tiles that have a single tile-part with TPsot == 0 and TNsot == 0 */ if (! p_j2k->m_specific_param.m_decoder.m_can_decode) { - OPJ_UINT32 l_nb_tiles = p_j2k->m_cp.th * p_j2k->m_cp.tw; l_tcp = p_j2k->m_cp.tcps + p_j2k->m_current_tile_number; while ((p_j2k->m_current_tile_number < l_nb_tiles) && (l_tcp->m_data == 00)) { @@ -9236,6 +10016,14 @@ static OPJ_BOOL opj_j2k_update_image_dimensions(opj_image_t* p_image, l_img_comp = p_image->comps; for (it_comp = 0; it_comp < p_image->numcomps; ++it_comp) { OPJ_INT32 l_h, l_w; + if (p_image->x0 > (OPJ_UINT32)INT_MAX || + p_image->y0 > (OPJ_UINT32)INT_MAX || + p_image->x1 > (OPJ_UINT32)INT_MAX || + p_image->y1 > (OPJ_UINT32)INT_MAX) { + opj_event_msg(p_manager, EVT_ERROR, + "Image coordinates above INT_MAX are not supported\n"); + return OPJ_FALSE; + } l_img_comp->x0 = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)p_image->x0, (OPJ_INT32)l_img_comp->dx); @@ -9754,9 +10542,9 @@ static OPJ_BOOL opj_j2k_read_SPCod_SPCoc(opj_j2k_t *p_j2k, return OPJ_FALSE; } - opj_read_bytes(l_current_ptr, &l_tccp->numresolutions, - 1); /* SPcox (D) */ - ++l_tccp->numresolutions; /* tccp->numresolutions = read() + 1 */ + /* SPcod (D) / SPcoc (A) */ + opj_read_bytes(l_current_ptr, &l_tccp->numresolutions, 1); + ++l_tccp->numresolutions; /* tccp->numresolutions = read() + 1 */ if (l_tccp->numresolutions > OPJ_J2K_MAXRLVLS) { opj_event_msg(p_manager, EVT_ERROR, "Invalid value for numresolutions : %d, max value is set in openjpeg.h at %d\n", @@ -9777,11 +10565,13 @@ static OPJ_BOOL opj_j2k_read_SPCod_SPCoc(opj_j2k_t *p_j2k, return OPJ_FALSE; } - opj_read_bytes(l_current_ptr, &l_tccp->cblkw, 1); /* SPcoc (E) */ + /* SPcod (E) / SPcoc (B) */ + opj_read_bytes(l_current_ptr, &l_tccp->cblkw, 1); ++l_current_ptr; l_tccp->cblkw += 2; - opj_read_bytes(l_current_ptr, &l_tccp->cblkh, 1); /* SPcoc (F) */ + /* SPcod (F) / SPcoc (C) */ + opj_read_bytes(l_current_ptr, &l_tccp->cblkh, 1); ++l_current_ptr; l_tccp->cblkh += 2; @@ -9792,8 +10582,8 @@ static OPJ_BOOL opj_j2k_read_SPCod_SPCoc(opj_j2k_t *p_j2k, return OPJ_FALSE; } - - opj_read_bytes(l_current_ptr, &l_tccp->cblksty, 1); /* SPcoc (G) */ + /* SPcod (G) / SPcoc (D) */ + opj_read_bytes(l_current_ptr, &l_tccp->cblksty, 1); ++l_current_ptr; if (l_tccp->cblksty & 0xC0U) { /* 2 msb are reserved, assume we can't 
read */ opj_event_msg(p_manager, EVT_ERROR, @@ -9801,9 +10591,16 @@ static OPJ_BOOL opj_j2k_read_SPCod_SPCoc(opj_j2k_t *p_j2k, return OPJ_FALSE; } - opj_read_bytes(l_current_ptr, &l_tccp->qmfbid, 1); /* SPcoc (H) */ + /* SPcod (H) / SPcoc (E) */ + opj_read_bytes(l_current_ptr, &l_tccp->qmfbid, 1); ++l_current_ptr; + if (l_tccp->qmfbid > 1) { + opj_event_msg(p_manager, EVT_ERROR, + "Error reading SPCod SPCoc element, Invalid transformation found\n"); + return OPJ_FALSE; + } + *p_header_size = *p_header_size - 5; /* use custom precinct size ? */ @@ -9813,8 +10610,9 @@ static OPJ_BOOL opj_j2k_read_SPCod_SPCoc(opj_j2k_t *p_j2k, return OPJ_FALSE; } + /* SPcod (I_i) / SPcoc (F_i) */ for (i = 0; i < l_tccp->numresolutions; ++i) { - opj_read_bytes(l_current_ptr, &l_tmp, 1); /* SPcoc (I_i) */ + opj_read_bytes(l_current_ptr, &l_tmp, 1); ++l_current_ptr; /* Precinct exponent 0 is only allowed for lowest resolution level (Table A.21) */ if ((i != 0) && (((l_tmp & 0xf) == 0) || ((l_tmp >> 4) == 0))) { @@ -10657,6 +11455,42 @@ static OPJ_BOOL opj_j2k_allocate_tile_element_cstr_index(opj_j2k_t *p_j2k) return OPJ_TRUE; } +static OPJ_BOOL opj_j2k_are_all_used_components_decoded(opj_j2k_t *p_j2k, + opj_event_mgr_t * p_manager) +{ + OPJ_UINT32 compno; + OPJ_BOOL decoded_all_used_components = OPJ_TRUE; + + if (p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode) { + for (compno = 0; + compno < p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode; compno++) { + OPJ_UINT32 dec_compno = + p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode[compno]; + if (p_j2k->m_output_image->comps[dec_compno].data == NULL) { + opj_event_msg(p_manager, EVT_WARNING, "Failed to decode component %d\n", + dec_compno); + decoded_all_used_components = OPJ_FALSE; + } + } + } else { + for (compno = 0; compno < p_j2k->m_output_image->numcomps; compno++) { + if (p_j2k->m_output_image->comps[compno].data == NULL) { + opj_event_msg(p_manager, EVT_WARNING, "Failed to decode component %d\n", + compno); + decoded_all_used_components = OPJ_FALSE; + } + } + } + + if (decoded_all_used_components == OPJ_FALSE) { + opj_event_msg(p_manager, EVT_ERROR, "Failed to decode all used components\n"); + return OPJ_FALSE; + } + + return OPJ_TRUE; +} + + static OPJ_BOOL opj_j2k_decode_tiles(opj_j2k_t *p_j2k, opj_stream_private_t *p_stream, opj_event_mgr_t * p_manager) @@ -10768,6 +11602,10 @@ static OPJ_BOOL opj_j2k_decode_tiles(opj_j2k_t *p_j2k, } } + if (! opj_j2k_are_all_used_components_decoded(p_j2k, p_manager)) { + return OPJ_FALSE; + } + return OPJ_TRUE; } @@ -10896,6 +11734,10 @@ static OPJ_BOOL opj_j2k_decode_one_tile(opj_j2k_t *p_j2k, } + if (! 
opj_j2k_are_all_used_components_decoded(p_j2k, p_manager)) { + return OPJ_FALSE; + } + return OPJ_TRUE; } @@ -11182,6 +12024,42 @@ OPJ_BOOL opj_j2k_set_decoded_resolution_factor(opj_j2k_t *p_j2k, return OPJ_FALSE; } +/* ----------------------------------------------------------------------- */ + +OPJ_BOOL opj_j2k_encoder_set_extra_options( + opj_j2k_t *p_j2k, + const char* const* p_options, + opj_event_mgr_t * p_manager) +{ + const char* const* p_option_iter; + + if (p_options == NULL) { + return OPJ_TRUE; + } + + for (p_option_iter = p_options; *p_option_iter != NULL; ++p_option_iter) { + if (strncmp(*p_option_iter, "PLT=", 4) == 0) { + if (strcmp(*p_option_iter, "PLT=YES") == 0) { + p_j2k->m_specific_param.m_encoder.m_PLT = OPJ_TRUE; + } else if (strcmp(*p_option_iter, "PLT=NO") == 0) { + p_j2k->m_specific_param.m_encoder.m_PLT = OPJ_FALSE; + } else { + opj_event_msg(p_manager, EVT_ERROR, + "Invalid value for option: %s.\n", *p_option_iter); + return OPJ_FALSE; + } + } else { + opj_event_msg(p_manager, EVT_ERROR, + "Invalid option: %s.\n", *p_option_iter); + return OPJ_FALSE; + } + } + + return OPJ_TRUE; +} + +/* ----------------------------------------------------------------------- */ + OPJ_BOOL opj_j2k_encode(opj_j2k_t * p_j2k, opj_stream_private_t *p_stream, opj_event_mgr_t * p_manager) @@ -11239,7 +12117,7 @@ OPJ_BOOL opj_j2k_encode(opj_j2k_t * p_j2k, } } } - l_current_tile_size = opj_tcd_get_encoded_tile_size(p_j2k->m_tcd); + l_current_tile_size = opj_tcd_get_encoder_input_buffer_size(p_j2k->m_tcd); if (!l_reuse_data) { if (l_current_tile_size > l_max_tile_size) { OPJ_BYTE *l_new_current_data = (OPJ_BYTE *) opj_realloc(l_current_data, @@ -11567,7 +12445,7 @@ static OPJ_BOOL opj_j2k_setup_end_compress(opj_j2k_t *p_j2k, return OPJ_FALSE; } - if (OPJ_IS_CINEMA(p_j2k->m_cp.rsiz)) { + if (OPJ_IS_CINEMA(p_j2k->m_cp.rsiz) || OPJ_IS_IMF(p_j2k->m_cp.rsiz)) { if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, (opj_procedure)opj_j2k_write_updated_tlm, p_manager)) { return OPJ_FALSE; @@ -11650,7 +12528,7 @@ static OPJ_BOOL opj_j2k_setup_header_writing(opj_j2k_t *p_j2k, return OPJ_FALSE; } - if (OPJ_IS_CINEMA(p_j2k->m_cp.rsiz)) { + if (OPJ_IS_CINEMA(p_j2k->m_cp.rsiz) || OPJ_IS_IMF(p_j2k->m_cp.rsiz)) { if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, (opj_procedure)opj_j2k_write_tlm, p_manager)) { return OPJ_FALSE; @@ -11677,7 +12555,8 @@ static OPJ_BOOL opj_j2k_setup_header_writing(opj_j2k_t *p_j2k, } /* DEVELOPER CORNER, insert your custom procedures */ - if (p_j2k->m_cp.rsiz & OPJ_EXTENSION_MCT) { + if ((p_j2k->m_cp.rsiz & (OPJ_PROFILE_PART2 | OPJ_EXTENSION_MCT)) == + (OPJ_PROFILE_PART2 | OPJ_EXTENSION_MCT)) { if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, (opj_procedure)opj_j2k_write_mct_data_group, p_manager)) { return OPJ_FALSE; @@ -11707,7 +12586,7 @@ static OPJ_BOOL opj_j2k_setup_header_writing(opj_j2k_t *p_j2k, static OPJ_BOOL opj_j2k_write_first_tile_part(opj_j2k_t *p_j2k, OPJ_BYTE * p_data, OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, + OPJ_UINT32 total_data_size, opj_stream_private_t *p_stream, struct opj_event_mgr * p_manager) { @@ -11731,7 +12610,7 @@ static OPJ_BOOL opj_j2k_write_first_tile_part(opj_j2k_t *p_j2k, l_current_nb_bytes_written = 0; l_begin_data = p_data; - if (! opj_j2k_write_sot(p_j2k, p_data, p_total_data_size, + if (! 
opj_j2k_write_sot(p_j2k, p_data, total_data_size, &l_current_nb_bytes_written, p_stream, p_manager)) { return OPJ_FALSE; @@ -11739,7 +12618,7 @@ static OPJ_BOOL opj_j2k_write_first_tile_part(opj_j2k_t *p_j2k, l_nb_bytes_written += l_current_nb_bytes_written; p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; + total_data_size -= l_current_nb_bytes_written; if (!OPJ_IS_CINEMA(l_cp->rsiz)) { #if 0 @@ -11749,29 +12628,29 @@ static OPJ_BOOL opj_j2k_write_first_tile_part(opj_j2k_t *p_j2k, p_manager); l_nb_bytes_written += l_current_nb_bytes_written; p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; + total_data_size -= l_current_nb_bytes_written; l_current_nb_bytes_written = 0; opj_j2k_write_qcc_in_memory(p_j2k, compno, p_data, &l_current_nb_bytes_written, p_manager); l_nb_bytes_written += l_current_nb_bytes_written; p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; + total_data_size -= l_current_nb_bytes_written; } #endif - if (l_cp->tcps[p_j2k->m_current_tile_number].numpocs) { + if (l_cp->tcps[p_j2k->m_current_tile_number].POC) { l_current_nb_bytes_written = 0; opj_j2k_write_poc_in_memory(p_j2k, p_data, &l_current_nb_bytes_written, p_manager); l_nb_bytes_written += l_current_nb_bytes_written; p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; + total_data_size -= l_current_nb_bytes_written; } } l_current_nb_bytes_written = 0; if (! opj_j2k_write_sod(p_j2k, l_tcd, p_data, &l_current_nb_bytes_written, - p_total_data_size, p_stream, p_manager)) { + total_data_size, p_stream, p_manager)) { return OPJ_FALSE; } @@ -11782,7 +12661,7 @@ static OPJ_BOOL opj_j2k_write_first_tile_part(opj_j2k_t *p_j2k, opj_write_bytes(l_begin_data + 6, l_nb_bytes_written, 4); /* PSOT */ - if (OPJ_IS_CINEMA(l_cp->rsiz)) { + if (OPJ_IS_CINEMA(l_cp->rsiz) || OPJ_IS_IMF(l_cp->rsiz)) { opj_j2k_update_tlm(p_j2k, l_nb_bytes_written); } @@ -11792,7 +12671,7 @@ static OPJ_BOOL opj_j2k_write_first_tile_part(opj_j2k_t *p_j2k, static OPJ_BOOL opj_j2k_write_all_tile_parts(opj_j2k_t *p_j2k, OPJ_BYTE * p_data, OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, + OPJ_UINT32 total_data_size, opj_stream_private_t *p_stream, struct opj_event_mgr * p_manager ) @@ -11825,7 +12704,7 @@ static OPJ_BOOL opj_j2k_write_all_tile_parts(opj_j2k_t *p_j2k, l_begin_data = p_data; if (! opj_j2k_write_sot(p_j2k, p_data, - p_total_data_size, + total_data_size, &l_current_nb_bytes_written, p_stream, p_manager)) { @@ -11834,25 +12713,25 @@ static OPJ_BOOL opj_j2k_write_all_tile_parts(opj_j2k_t *p_j2k, l_nb_bytes_written += l_current_nb_bytes_written; p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; + total_data_size -= l_current_nb_bytes_written; l_part_tile_size += l_current_nb_bytes_written; l_current_nb_bytes_written = 0; if (! 
opj_j2k_write_sod(p_j2k, l_tcd, p_data, &l_current_nb_bytes_written, - p_total_data_size, p_stream, p_manager)) { + total_data_size, p_stream, p_manager)) { return OPJ_FALSE; } p_data += l_current_nb_bytes_written; l_nb_bytes_written += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; + total_data_size -= l_current_nb_bytes_written; l_part_tile_size += l_current_nb_bytes_written; /* Writing Psot in SOT marker */ opj_write_bytes(l_begin_data + 6, l_part_tile_size, 4); /* PSOT */ - if (OPJ_IS_CINEMA(l_cp->rsiz)) { + if (OPJ_IS_CINEMA(l_cp->rsiz) || OPJ_IS_IMF(l_cp->rsiz)) { opj_j2k_update_tlm(p_j2k, l_part_tile_size); } @@ -11871,7 +12750,7 @@ static OPJ_BOOL opj_j2k_write_all_tile_parts(opj_j2k_t *p_j2k, l_begin_data = p_data; if (! opj_j2k_write_sot(p_j2k, p_data, - p_total_data_size, + total_data_size, &l_current_nb_bytes_written, p_stream, p_manager)) { return OPJ_FALSE; @@ -11879,26 +12758,26 @@ static OPJ_BOOL opj_j2k_write_all_tile_parts(opj_j2k_t *p_j2k, l_nb_bytes_written += l_current_nb_bytes_written; p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; + total_data_size -= l_current_nb_bytes_written; l_part_tile_size += l_current_nb_bytes_written; l_current_nb_bytes_written = 0; if (! opj_j2k_write_sod(p_j2k, l_tcd, p_data, &l_current_nb_bytes_written, - p_total_data_size, p_stream, p_manager)) { + total_data_size, p_stream, p_manager)) { return OPJ_FALSE; } l_nb_bytes_written += l_current_nb_bytes_written; p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; + total_data_size -= l_current_nb_bytes_written; l_part_tile_size += l_current_nb_bytes_written; /* Writing Psot in SOT marker */ opj_write_bytes(l_begin_data + 6, l_part_tile_size, 4); /* PSOT */ - if (OPJ_IS_CINEMA(l_cp->rsiz)) { + if (OPJ_IS_CINEMA(l_cp->rsiz) || OPJ_IS_IMF(l_cp->rsiz)) { opj_j2k_update_tlm(p_j2k, l_part_tile_size); } diff --git a/3rdparty/openjpeg/openjp2/j2k.h b/3rdparty/openjpeg/openjp2/j2k.h index 5d393c9813..9eb50b50da 100644 --- a/3rdparty/openjpeg/openjp2/j2k.h +++ b/3rdparty/openjpeg/openjp2/j2k.h @@ -531,8 +531,14 @@ typedef struct opj_j2k_enc { OPJ_BYTE * m_header_tile_data; /* size of the encoded_data */ + OPJ_UINT32 m_header_tile_data_size; + /* whether to generate PLT markers */ + OPJ_BOOL m_PLT; + + /* reserved bytes in m_encoded_tile_size for PLT markers */ + OPJ_UINT32 m_reserved_bytes_for_PLT; } opj_j2k_enc_t; @@ -577,15 +583,16 @@ typedef struct opj_j2k { /** the current tile coder/decoder **/ struct opj_tcd * m_tcd; - /** Number of threads to use */ - int m_num_threads; - /** Thread pool */ opj_thread_pool_t* m_tp; + /** Image width coming from JP2 IHDR box. 0 from a pure codestream */ OPJ_UINT32 ihdr_w; + + /** Image height coming from JP2 IHDR box. 0 from a pure codestream */ OPJ_UINT32 ihdr_h; - OPJ_UINT32 enumcs; + + /** Set to 1 by the decoder initialization if OPJ_DPARAMETERS_DUMP_FLAG is set */ unsigned int dump_state; } opj_j2k_t; @@ -827,6 +834,19 @@ OPJ_BOOL opj_j2k_set_decoded_resolution_factor(opj_j2k_t *p_j2k, OPJ_UINT32 res_factor, opj_event_mgr_t * p_manager); +/** + * Specify extra options for the encoder. + * + * @param p_j2k the jpeg2000 codec. + * @param p_options options + * @param p_manager the user event manager + * + * @see opj_encoder_set_extra_options() for more details. + */ +OPJ_BOOL opj_j2k_encoder_set_extra_options( + opj_j2k_t *p_j2k, + const char* const* p_options, + opj_event_mgr_t * p_manager); /** * Writes a tile. 
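The j2k.h hunk above declares opj_j2k_encoder_set_extra_options(), the codec-level half of the new extra-options mechanism; its public wrapper opj_encoder_set_extra_options() and the single PLT=YES/NO option it currently understands are added to openjpeg.h further down in this patch. A minimal caller-side sketch follows; it is illustrative rather than part of the patch: the helper name and the chosen IMF main/sub levels are made up, and image, parameters and stream are assumed to come from the usual libopenjp2 setup calls (opj_image_create(), opj_set_default_encoder_parameters(), opj_stream_create_default_file_stream()).

#include <openjpeg.h>

/* Hypothetical helper: encode one image as IMF 2K with PLT markers.
 * image, parameters and stream are assumed to be fully prepared by
 * the caller as described above. */
static OPJ_BOOL encode_imf_with_plt(opj_image_t *image,
                                    opj_cparameters_t *parameters,
                                    opj_stream_t *stream)
{
    /* NULL-terminated list of KEY=VALUE strings, as the API requires */
    const char * const options[] = { "PLT=YES", NULL };
    opj_codec_t *codec;
    OPJ_BOOL ok;

    /* IMF 2K profile with illustrative mainlevel 2 and sublevel 1: per the
     * OPJ_GET_IMF_* macros added to openjpeg.h below, the profile sits in
     * bits 8-15 of rsiz, the sublevel in bits 4-7, the mainlevel in bits 0-3. */
    parameters->rsiz = (OPJ_UINT16)(OPJ_PROFILE_IMF_2K | (1 << 4) | 2);

    codec = opj_create_compress(OPJ_CODEC_JP2);
    if (codec == NULL) {
        return OPJ_FALSE;
    }
    if (!opj_setup_encoder(codec, parameters, image) ||
            /* must be called after opj_setup_encoder() and before
               opj_start_compress(), as the doc comment below states */
            !opj_encoder_set_extra_options(codec, options)) {
        opj_destroy_codec(codec);
        return OPJ_FALSE;
    }
    ok = opj_start_compress(codec, image, stream) &&
         opj_encode(codec, stream) &&
         opj_end_compress(codec, stream);
    opj_destroy_codec(codec);
    return ok;
}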
diff --git a/3rdparty/openjpeg/openjp2/jp2.c b/3rdparty/openjpeg/openjp2/jp2.c index 4402ffe3c5..7c065ba742 100644 --- a/3rdparty/openjpeg/openjp2/jp2.c +++ b/3rdparty/openjpeg/openjp2/jp2.c @@ -586,6 +586,12 @@ static OPJ_BOOL opj_jp2_read_ihdr(opj_jp2_t *jp2, opj_read_bytes(p_image_header_data, &(jp2->numcomps), 2); /* NC */ p_image_header_data += 2; + if (jp2->h < 1 || jp2->w < 1 || jp2->numcomps < 1) { + opj_event_msg(p_manager, EVT_ERROR, + "Wrong values for: w(%d) h(%d) numcomps(%d) (ihdr)\n", + jp2->w, jp2->h, jp2->numcomps); + return OPJ_FALSE; + } if ((jp2->numcomps - 1U) >= 16384U) { /* unsigned underflow is well defined: 1U <= jp2->numcomps <= 16384U */ opj_event_msg(p_manager, EVT_ERROR, "Invalid number of components (ihdr)\n"); @@ -1584,9 +1590,7 @@ static OPJ_BOOL opj_jp2_read_colr(opj_jp2_t *jp2, "COLR BOX meth value is not a regular value (%d), " "so we will ignore the entire Colour Specification box. \n", jp2->meth); } - if (jp2->color.jp2_has_colr) { - jp2->j2k->enumcs = jp2->enumcs; - } + return OPJ_TRUE; } @@ -3236,6 +3240,18 @@ OPJ_BOOL opj_jp2_set_decoded_resolution_factor(opj_jp2_t *p_jp2, return opj_j2k_set_decoded_resolution_factor(p_jp2->j2k, res_factor, p_manager); } +/* ----------------------------------------------------------------------- */ + +OPJ_BOOL opj_jp2_encoder_set_extra_options( + opj_jp2_t *p_jp2, + const char* const* p_options, + opj_event_mgr_t * p_manager) +{ + return opj_j2k_encoder_set_extra_options(p_jp2->j2k, p_options, p_manager); +} + +/* ----------------------------------------------------------------------- */ + /* JPIP specific */ #ifdef USE_JPIP diff --git a/3rdparty/openjpeg/openjp2/jp2.h b/3rdparty/openjpeg/openjp2/jp2.h index 34abd5118e..9e7fa56674 100644 --- a/3rdparty/openjpeg/openjp2/jp2.h +++ b/3rdparty/openjpeg/openjp2/jp2.h @@ -459,6 +459,20 @@ OPJ_BOOL opj_jp2_set_decoded_resolution_factor(opj_jp2_t *p_jp2, OPJ_UINT32 res_factor, opj_event_mgr_t * p_manager); +/** + * Specify extra options for the encoder. + * + * @param p_jp2 the jpeg2000 codec. + * @param p_options options + * @param p_manager the user event manager + * + * @see opj_encoder_set_extra_options() for more details. + */ +OPJ_BOOL opj_jp2_encoder_set_extra_options( + opj_jp2_t *p_jp2, + const char* const* p_options, + opj_event_mgr_t * p_manager); + /* TODO MSD: clean these 3 functions */ /** diff --git a/3rdparty/openjpeg/openjp2/libopenjp2.pc.cmake.in b/3rdparty/openjpeg/openjp2/libopenjp2.pc.cmake.in deleted file mode 100644 index 62159b00a4..0000000000 --- a/3rdparty/openjpeg/openjp2/libopenjp2.pc.cmake.in +++ /dev/null @@ -1,14 +0,0 @@ -prefix=@CMAKE_INSTALL_PREFIX@ -bindir=${prefix}/@OPENJPEG_INSTALL_BIN_DIR@ -mandir=${prefix}/@OPENJPEG_INSTALL_MAN_DIR@ -docdir=${prefix}/@OPENJPEG_INSTALL_DOC_DIR@ -libdir=${prefix}/@OPENJPEG_INSTALL_LIB_DIR@ -includedir=${prefix}/@OPENJPEG_INSTALL_INCLUDE_DIR@ - -Name: openjp2 -Description: JPEG2000 library (Part 1 and 2) -URL: http://www.openjpeg.org/ -Version: @OPENJPEG_VERSION@ -Libs: -L${libdir} -lopenjp2 -Libs.private: -lm -Cflags: -I${includedir} diff --git a/3rdparty/openjpeg/openjp2/mct.c b/3rdparty/openjpeg/openjp2/mct.c index b79d4b87c4..88c8f40920 100644 --- a/3rdparty/openjpeg/openjp2/mct.c +++ b/3rdparty/openjpeg/openjp2/mct.c @@ -183,7 +183,7 @@ void opj_mct_decode( OPJ_INT32* OPJ_RESTRICT c2, OPJ_SIZE_T n) { - OPJ_UINT32 i; + OPJ_SIZE_T i; for (i = 0; i < n; ++i) { OPJ_INT32 y = c0[i]; OPJ_INT32 u = c1[i]; @@ -209,175 +209,72 @@ OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) /* */ /* Forward irreversible MCT. 
*/ /* */ -#ifdef __SSE4_1__ void opj_mct_encode_real( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, + OPJ_FLOAT32* OPJ_RESTRICT c0, + OPJ_FLOAT32* OPJ_RESTRICT c1, + OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n) { OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; +#ifdef __SSE__ + const __m128 YR = _mm_set1_ps(0.299f); + const __m128 YG = _mm_set1_ps(0.587f); + const __m128 YB = _mm_set1_ps(0.114f); + const __m128 UR = _mm_set1_ps(-0.16875f); + const __m128 UG = _mm_set1_ps(-0.331260f); + const __m128 UB = _mm_set1_ps(0.5f); + const __m128 VR = _mm_set1_ps(0.5f); + const __m128 VG = _mm_set1_ps(-0.41869f); + const __m128 VB = _mm_set1_ps(-0.08131f); + for (i = 0; i < (n >> 3); i ++) { + __m128 r, g, b, y, u, v; - const __m128i ry = _mm_set1_epi32(2449); - const __m128i gy = _mm_set1_epi32(4809); - const __m128i by = _mm_set1_epi32(934); - const __m128i ru = _mm_set1_epi32(1382); - const __m128i gu = _mm_set1_epi32(2714); - /* const __m128i bu = _mm_set1_epi32(4096); */ - /* const __m128i rv = _mm_set1_epi32(4096); */ - const __m128i gv = _mm_set1_epi32(3430); - const __m128i bv = _mm_set1_epi32(666); - const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), - _MM_SHUFFLE(1, 0, 1, 0)); + r = _mm_load_ps(c0); + g = _mm_load_ps(c1); + b = _mm_load_ps(c2); + y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)), + _mm_mul_ps(b, YB)); + u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)), + _mm_mul_ps(b, UB)); + v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)), + _mm_mul_ps(b, VB)); + _mm_store_ps(c0, y); + _mm_store_ps(c1, u); + _mm_store_ps(c2, v); + c0 += 4; + c1 += 4; + c2 += 4; - for (i = 0; i < (len & ~3U); i += 4) { - __m128i lo, hi; - __m128i y, u, v; - __m128i r = _mm_load_si128((const __m128i *) & (c0[i])); - __m128i g = _mm_load_si128((const __m128i *) & (c1[i])); - __m128i b = _mm_load_si128((const __m128i *) & (c2[i])); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ry); - hi = _mm_mul_epi32(hi, ry); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gy); - hi = _mm_mul_epi32(hi, gy); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, by); - hi = _mm_mul_epi32(hi, by); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c0[i]), y); - - /*lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_blend_epi16(lo, hi, 0xCC); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ru); 
- hi = _mm_mul_epi32(hi, ru); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gu); - hi = _mm_mul_epi32(hi, gu); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c1[i]), u); - - /*lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gv); - hi = _mm_mul_epi32(hi, gv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, bv); - hi = _mm_mul_epi32(hi, bv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c2[i]), v); + r = _mm_load_ps(c0); + g = _mm_load_ps(c1); + b = _mm_load_ps(c2); + y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)), + _mm_mul_ps(b, YB)); + u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)), + _mm_mul_ps(b, UB)); + v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)), + _mm_mul_ps(b, VB)); + _mm_store_ps(c0, y); + _mm_store_ps(c1, u); + _mm_store_ps(c2, v); + c0 += 4; + c1 += 4; + c2 += 4; } - for (; i < len; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, - 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, - 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, - 3430) - opj_int_fix_mul(b, 666); - c0[i] = y; - c1[i] = u; - c2[i] = v; - } -} -#else -void opj_mct_encode_real( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_UINT32 i; - for (i = 0; i < n; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, - 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, - 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, - 3430) - opj_int_fix_mul(b, 666); - c0[i] = y; - c1[i] = u; - c2[i] = v; - } -} + n &= 7; #endif + for (i = 0; i < n; ++i) { + OPJ_FLOAT32 r = c0[i]; + OPJ_FLOAT32 g = c1[i]; + OPJ_FLOAT32 b = c2[i]; + OPJ_FLOAT32 y = 0.299f * r + 0.587f * g + 0.114f * b; + OPJ_FLOAT32 u = -0.16875f * r - 0.331260f * g + 0.5f * b; + OPJ_FLOAT32 v = 0.5f * r - 0.41869f * g - 
0.08131f * b; + c0[i] = y; + c1[i] = u; + c2[i] = v; + } +} /* */ /* Inverse irreversible MCT. */ @@ -388,7 +285,7 @@ void opj_mct_decode_real( OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n) { - OPJ_UINT32 i; + OPJ_SIZE_T i; #ifdef __SSE__ __m128 vrv, vgu, vgv, vbu; vrv = _mm_set1_ps(1.402f); diff --git a/3rdparty/openjpeg/openjp2/mct.h b/3rdparty/openjpeg/openjp2/mct.h index 2e37ce7333..3e1f5e4946 100644 --- a/3rdparty/openjpeg/openjp2/mct.h +++ b/3rdparty/openjpeg/openjp2/mct.h @@ -85,8 +85,9 @@ Apply an irreversible multi-component transform to an image @param c2 Samples blue component @param n Number of samples for each component */ -void opj_mct_encode_real(OPJ_INT32* OPJ_RESTRICT c0, OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); +void opj_mct_encode_real(OPJ_FLOAT32* OPJ_RESTRICT c0, + OPJ_FLOAT32* OPJ_RESTRICT c1, + OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); /** Apply an irreversible multi-component inverse transform to an image @param c0 Samples for luminance component diff --git a/3rdparty/openjpeg/openjp2/mqc.c b/3rdparty/openjpeg/openjp2/mqc.c index 6299b171d8..4cbfabd033 100644 --- a/3rdparty/openjpeg/openjp2/mqc.c +++ b/3rdparty/openjpeg/openjp2/mqc.c @@ -46,27 +46,6 @@ /** @name Local static functions */ /*@{*/ -/** -Output a byte, doing bit-stuffing if necessary. -After a 0xff byte, the next byte must be smaller than 0x90. -@param mqc MQC handle -*/ -static void opj_mqc_byteout(opj_mqc_t *mqc); -/** -Renormalize mqc->a and mqc->c while encoding, so that mqc->a stays between 0x8000 and 0x10000 -@param mqc MQC handle -*/ -static void opj_mqc_renorme(opj_mqc_t *mqc); -/** -Encode the most probable symbol -@param mqc MQC handle -*/ -static void opj_mqc_codemps(opj_mqc_t *mqc); -/** -Encode the most least symbol -@param mqc MQC handle -*/ -static void opj_mqc_codelps(opj_mqc_t *mqc); /** Fill mqc->c with 1's for flushing @param mqc MQC handle @@ -182,80 +161,6 @@ static const opj_mqc_state_t mqc_states[47 * 2] = { ========================================================== */ -static void opj_mqc_byteout(opj_mqc_t *mqc) -{ - /* bp is initialized to start - 1 in opj_mqc_init_enc() */ - /* but this is safe, see opj_tcd_code_block_enc_allocate_data() */ - assert(mqc->bp >= mqc->start - 1); - if (*mqc->bp == 0xff) { - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); - mqc->c &= 0xfffff; - mqc->ct = 7; - } else { - if ((mqc->c & 0x8000000) == 0) { - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); - mqc->c &= 0x7ffff; - mqc->ct = 8; - } else { - (*mqc->bp)++; - if (*mqc->bp == 0xff) { - mqc->c &= 0x7ffffff; - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); - mqc->c &= 0xfffff; - mqc->ct = 7; - } else { - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); - mqc->c &= 0x7ffff; - mqc->ct = 8; - } - } - } -} - -static void opj_mqc_renorme(opj_mqc_t *mqc) -{ - do { - mqc->a <<= 1; - mqc->c <<= 1; - mqc->ct--; - if (mqc->ct == 0) { - opj_mqc_byteout(mqc); - } - } while ((mqc->a & 0x8000) == 0); -} - -static void opj_mqc_codemps(opj_mqc_t *mqc) -{ - mqc->a -= (*mqc->curctx)->qeval; - if ((mqc->a & 0x8000) == 0) { - if (mqc->a < (*mqc->curctx)->qeval) { - mqc->a = (*mqc->curctx)->qeval; - } else { - mqc->c += (*mqc->curctx)->qeval; - } - *mqc->curctx = (*mqc->curctx)->nmps; - opj_mqc_renorme(mqc); - } else { - mqc->c += (*mqc->curctx)->qeval; - } -} - -static void opj_mqc_codelps(opj_mqc_t *mqc) -{ - mqc->a -= (*mqc->curctx)->qeval; - if (mqc->a < (*mqc->curctx)->qeval) { - mqc->c += (*mqc->curctx)->qeval; - } else { - mqc->a = (*mqc->curctx)->qeval; - } - 
*mqc->curctx = (*mqc->curctx)->nlps; - opj_mqc_renorme(mqc); -} - static void opj_mqc_setbits(opj_mqc_t *mqc) { OPJ_UINT32 tempc = mqc->c + mqc->a; @@ -303,14 +208,6 @@ void opj_mqc_init_enc(opj_mqc_t *mqc, OPJ_BYTE *bp) mqc->end_of_byte_stream_counter = 0; } -void opj_mqc_encode(opj_mqc_t *mqc, OPJ_UINT32 d) -{ - if ((*mqc->curctx)->mps == d) { - opj_mqc_codemps(mqc); - } else { - opj_mqc_codelps(mqc); - } -} void opj_mqc_flush(opj_mqc_t *mqc) { @@ -329,8 +226,6 @@ void opj_mqc_flush(opj_mqc_t *mqc) } } -#define BYPASS_CT_INIT 0xDEADBEEF - void opj_mqc_bypass_init_enc(opj_mqc_t *mqc) { /* This function is normally called after at least one opj_mqc_flush() */ @@ -475,6 +370,43 @@ void opj_mqc_erterm_enc(opj_mqc_t *mqc) } } +static INLINE void opj_mqc_renorme(opj_mqc_t *mqc) +{ + opj_mqc_renorme_macro(mqc, mqc->a, mqc->c, mqc->ct); +} + +/** +Encode the most probable symbol +@param mqc MQC handle +*/ +static INLINE void opj_mqc_codemps(opj_mqc_t *mqc) +{ + opj_mqc_codemps_macro(mqc, mqc->curctx, mqc->a, mqc->c, mqc->ct); +} + +/** +Encode the least probable symbol +@param mqc MQC handle +*/ +static INLINE void opj_mqc_codelps(opj_mqc_t *mqc) +{ + opj_mqc_codelps_macro(mqc, mqc->curctx, mqc->a, mqc->c, mqc->ct); +} + +/** +Encode a symbol using the MQ-coder +@param mqc MQC handle +@param d The symbol to be encoded (0 or 1) +*/ +static INLINE void opj_mqc_encode(opj_mqc_t *mqc, OPJ_UINT32 d) +{ + if ((*mqc->curctx)->mps == d) { + opj_mqc_codemps(mqc); + } else { + opj_mqc_codelps(mqc); + } +} + void opj_mqc_segmark_enc(opj_mqc_t *mqc) { OPJ_UINT32 i; @@ -557,4 +489,36 @@ void opj_mqc_setstate(opj_mqc_t *mqc, OPJ_UINT32 ctxno, OPJ_UINT32 msb, mqc->ctxs[ctxno] = &mqc_states[msb + (OPJ_UINT32)(prob << 1)]; } - +void opj_mqc_byteout(opj_mqc_t *mqc) +{ + /* bp is initialized to start - 1 in opj_mqc_init_enc() */ + /* but this is safe, see opj_tcd_code_block_enc_allocate_data() */ + assert(mqc->bp >= mqc->start - 1); + if (*mqc->bp == 0xff) { + mqc->bp++; + *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); + mqc->c &= 0xfffff; + mqc->ct = 7; + } else { + if ((mqc->c & 0x8000000) == 0) { + mqc->bp++; + *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); + mqc->c &= 0x7ffff; + mqc->ct = 8; + } else { + (*mqc->bp)++; + if (*mqc->bp == 0xff) { + mqc->c &= 0x7ffffff; + mqc->bp++; + *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); + mqc->c &= 0xfffff; + mqc->ct = 7; + } else { + mqc->bp++; + *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); + mqc->c &= 0x7ffff; + mqc->ct = 8; + } + } + } +} \ No newline at end of file diff --git a/3rdparty/openjpeg/openjp2/mqc.h index 69a2a79dc0..9850fed031 100644 --- a/3rdparty/openjpeg/openjp2/mqc.h +++ b/3rdparty/openjpeg/openjp2/mqc.h @@ -96,6 +96,8 @@ typedef struct opj_mqc { OPJ_BYTE backup[OPJ_COMMON_CBLK_DATA_EXTRA]; } opj_mqc_t; +#define BYPASS_CT_INIT 0xDEADBEEF + #include "mqc_inl.h" /** @name Exported functions */ /*@{*/ @@ -135,12 +137,7 @@ Set the current context used for coding/decoding @param ctxno Number that identifies the context */ #define opj_mqc_setcurctx(mqc, ctxno) (mqc)->curctx = &(mqc)->ctxs[(OPJ_UINT32)(ctxno)] -/** -Encode a symbol using the MQ-coder -@param mqc MQC handle -@param d The symbol to be encoded (0 or 1) -*/ -void opj_mqc_encode(opj_mqc_t *mqc, OPJ_UINT32 d); + /** Flush the encoder, so that all remaining data is written @param mqc MQC handle diff --git a/3rdparty/openjpeg/openjp2/mqc_inl.h b/3rdparty/openjpeg/openjp2/mqc_inl.h index 310a3287fd..0031b94be3 100644 --- a/3rdparty/openjpeg/openjp2/mqc_inl.h +++ b/3rdparty/openjpeg/openjp2/mqc_inl.h @@ -156,13 
+156,13 @@ static INLINE OPJ_UINT32 opj_mqc_raw_decode(opj_mqc_t *mqc) } \ } -#define DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct) \ +#define DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct) \ register const opj_mqc_state_t **curctx = mqc->curctx; \ register OPJ_UINT32 c = mqc->c; \ register OPJ_UINT32 a = mqc->a; \ register OPJ_UINT32 ct = mqc->ct -#define UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct) \ +#define UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct) \ mqc->curctx = curctx; \ mqc->c = c; \ mqc->a = a; \ @@ -193,4 +193,90 @@ Decode a symbol #define opj_mqc_decode(d, mqc) \ opj_mqc_decode_macro(d, mqc, mqc->curctx, mqc->a, mqc->c, mqc->ct) +/** +Output a byte, doing bit-stuffing if necessary. +After a 0xff byte, the next byte must be smaller than 0x90. +@param mqc MQC handle +*/ +void opj_mqc_byteout(opj_mqc_t *mqc); + +/** +Renormalize mqc->a and mqc->c while encoding, so that mqc->a stays between 0x8000 and 0x10000 +@param mqc MQC handle +@param a_ value of mqc->a +@param c_ value of mqc->c_ +@param ct_ value of mqc->ct_ +*/ +#define opj_mqc_renorme_macro(mqc, a_, c_, ct_) \ +{ \ + do { \ + a_ <<= 1; \ + c_ <<= 1; \ + ct_--; \ + if (ct_ == 0) { \ + mqc->c = c_; \ + opj_mqc_byteout(mqc); \ + c_ = mqc->c; \ + ct_ = mqc->ct; \ + } \ + } while( (a_ & 0x8000) == 0); \ +} + +#define opj_mqc_codemps_macro(mqc, curctx, a, c, ct) \ +{ \ + a -= (*curctx)->qeval; \ + if ((a & 0x8000) == 0) { \ + if (a < (*curctx)->qeval) { \ + a = (*curctx)->qeval; \ + } else { \ + c += (*curctx)->qeval; \ + } \ + *curctx = (*curctx)->nmps; \ + opj_mqc_renorme_macro(mqc, a, c, ct); \ + } else { \ + c += (*curctx)->qeval; \ + } \ +} + +#define opj_mqc_codelps_macro(mqc, curctx, a, c, ct) \ +{ \ + a -= (*curctx)->qeval; \ + if (a < (*curctx)->qeval) { \ + c += (*curctx)->qeval; \ + } else { \ + a = (*curctx)->qeval; \ + } \ + *curctx = (*curctx)->nlps; \ + opj_mqc_renorme_macro(mqc, a, c, ct); \ +} + +#define opj_mqc_encode_macro(mqc, curctx, a, c, ct, d) \ +{ \ + if ((*curctx)->mps == (d)) { \ + opj_mqc_codemps_macro(mqc, curctx, a, c, ct); \ + } else { \ + opj_mqc_codelps_macro(mqc, curctx, a, c, ct); \ + } \ +} + + +#define opj_mqc_bypass_enc_macro(mqc, c, ct, d) \ +{\ + if (ct == BYPASS_CT_INIT) {\ + ct = 8;\ + }\ + ct--;\ + c = c + ((d) << ct);\ + if (ct == 0) {\ + *mqc->bp = (OPJ_BYTE)c;\ + ct = 8;\ + /* If the previous byte was 0xff, make sure that the next msb is 0 */ \ + if (*mqc->bp == 0xff) {\ + ct = 7;\ + }\ + mqc->bp++;\ + c = 0;\ + }\ +} + #endif /* OPJ_MQC_INL_H */ diff --git a/3rdparty/openjpeg/openjp2/openjpeg.c b/3rdparty/openjpeg/openjp2/openjpeg.c index 7b12303423..9c9b6eb0c0 100644 --- a/3rdparty/openjpeg/openjp2/openjpeg.c +++ b/3rdparty/openjpeg/openjp2/openjpeg.c @@ -652,6 +652,14 @@ opj_codec_t* OPJ_CALLCONV opj_create_compress(OPJ_CODEC_FORMAT p_format) struct opj_image *, struct opj_event_mgr *)) opj_j2k_setup_encoder; + l_codec->m_codec_data.m_compression.opj_encoder_set_extra_options = (OPJ_BOOL( + *)(void *, + const char* const*, + struct opj_event_mgr *)) opj_j2k_encoder_set_extra_options; + + l_codec->opj_set_threads = + (OPJ_BOOL(*)(void * p_codec, OPJ_UINT32 num_threads)) opj_j2k_set_threads; + l_codec->m_codec = opj_j2k_create_compress(); if (! 
l_codec->m_codec) { opj_free(l_codec); @@ -690,6 +698,14 @@ opj_codec_t* OPJ_CALLCONV opj_create_compress(OPJ_CODEC_FORMAT p_format) struct opj_image *, struct opj_event_mgr *)) opj_jp2_setup_encoder; + l_codec->m_codec_data.m_compression.opj_encoder_set_extra_options = (OPJ_BOOL( + *)(void *, + const char* const*, + struct opj_event_mgr *)) opj_jp2_encoder_set_extra_options; + + l_codec->opj_set_threads = + (OPJ_BOOL(*)(void * p_codec, OPJ_UINT32 num_threads)) opj_jp2_set_threads; + l_codec->m_codec = opj_jp2_create(OPJ_FALSE); if (! l_codec->m_codec) { opj_free(l_codec); @@ -718,11 +734,11 @@ void OPJ_CALLCONV opj_set_default_encoder_parameters(opj_cparameters_t parameters->cp_cinema = OPJ_OFF; /* DEPRECATED */ parameters->rsiz = OPJ_PROFILE_NONE; parameters->max_comp_size = 0; - parameters->numresolution = 6; + parameters->numresolution = OPJ_COMP_PARAM_DEFAULT_NUMRESOLUTION; parameters->cp_rsiz = OPJ_STD_RSIZ; /* DEPRECATED */ - parameters->cblockw_init = 64; - parameters->cblockh_init = 64; - parameters->prog_order = OPJ_LRCP; + parameters->cblockw_init = OPJ_COMP_PARAM_DEFAULT_CBLOCKW; + parameters->cblockh_init = OPJ_COMP_PARAM_DEFAULT_CBLOCKH; + parameters->prog_order = OPJ_COMP_PARAM_DEFAULT_PROG_ORDER; parameters->roi_compno = -1; /* no ROI */ parameters->subsampling_dx = 1; parameters->subsampling_dy = 1; @@ -788,6 +804,27 @@ OPJ_BOOL OPJ_CALLCONV opj_setup_encoder(opj_codec_t *p_codec, return OPJ_FALSE; } +/* ----------------------------------------------------------------------- */ + +OPJ_BOOL OPJ_CALLCONV opj_encoder_set_extra_options(opj_codec_t *p_codec, + const char* const* options) +{ + if (p_codec) { + opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; + + if (! l_codec->is_decompressor) { + return l_codec->m_codec_data.m_compression.opj_encoder_set_extra_options( + l_codec->m_codec, + options, + &(l_codec->m_event_mgr)); + } + } + + return OPJ_FALSE; +} + +/* ----------------------------------------------------------------------- */ + OPJ_BOOL OPJ_CALLCONV opj_start_compress(opj_codec_t *p_codec, opj_image_t * p_image, opj_stream_t *p_stream) diff --git a/3rdparty/openjpeg/openjp2/openjpeg.h b/3rdparty/openjpeg/openjp2/openjpeg.h index 53a0e10c54..269ac329ae 100644 --- a/3rdparty/openjpeg/openjp2/openjpeg.h +++ b/3rdparty/openjpeg/openjp2/openjpeg.h @@ -78,7 +78,7 @@ Most compilers implement their own version of this keyword ... 
#if defined(OPJ_STATIC) || !defined(_WIN32) /* http://gcc.gnu.org/wiki/Visibility */ -# if __GNUC__ >= 4 +# if !defined(_WIN32) && __GNUC__ >= 4 # if defined(OPJ_STATIC) /* static library uses "hidden" */ # define OPJ_API __attribute__ ((visibility ("hidden"))) # else @@ -204,11 +204,11 @@ typedef size_t OPJ_SIZE_T; #define OPJ_PROFILE_BC_MULTI 0x0200 /** Multi Tile Broadcast profile defined in 15444-1 AMD3 */ #define OPJ_PROFILE_BC_MULTI_R 0x0300 /** Multi Tile Reversible Broadcast profile defined in 15444-1 AMD3 */ #define OPJ_PROFILE_IMF_2K 0x0400 /** 2K Single Tile Lossy IMF profile defined in 15444-1 AMD 8 */ -#define OPJ_PROFILE_IMF_4K 0x0401 /** 4K Single Tile Lossy IMF profile defined in 15444-1 AMD 8 */ -#define OPJ_PROFILE_IMF_8K 0x0402 /** 8K Single Tile Lossy IMF profile defined in 15444-1 AMD 8 */ -#define OPJ_PROFILE_IMF_2K_R 0x0403 /** 2K Single/Multi Tile Reversible IMF profile defined in 15444-1 AMD 8 */ +#define OPJ_PROFILE_IMF_4K 0x0500 /** 4K Single Tile Lossy IMF profile defined in 15444-1 AMD 8 */ +#define OPJ_PROFILE_IMF_8K 0x0600 /** 8K Single Tile Lossy IMF profile defined in 15444-1 AMD 8 */ +#define OPJ_PROFILE_IMF_2K_R 0x0700 /** 2K Single/Multi Tile Reversible IMF profile defined in 15444-1 AMD 8 */ #define OPJ_PROFILE_IMF_4K_R 0x0800 /** 4K Single/Multi Tile Reversible IMF profile defined in 15444-1 AMD 8 */ -#define OPJ_PROFILE_IMF_8K_R 0x0801 /** 8K Single/Multi Tile Reversible IMF profile defined in 15444-1 AMD 8 */ +#define OPJ_PROFILE_IMF_8K_R 0x0900 /** 8K Single/Multi Tile Reversible IMF profile defined in 15444-1 AMD 8 */ /** * JPEG 2000 Part-2 extensions @@ -225,6 +225,36 @@ typedef size_t OPJ_SIZE_T; #define OPJ_IS_IMF(v) (((v) >= OPJ_PROFILE_IMF_2K)&&((v) <= ((OPJ_PROFILE_IMF_8K_R) | (0x009b)))) #define OPJ_IS_PART2(v) ((v) & OPJ_PROFILE_PART2) +#define OPJ_GET_IMF_PROFILE(v) ((v) & 0xff00) /** Extract IMF profile without mainlevel/sublevel */ +#define OPJ_GET_IMF_MAINLEVEL(v) ((v) & 0xf) /** Extract IMF main level */ +#define OPJ_GET_IMF_SUBLEVEL(v) (((v) >> 4) & 0xf) /** Extract IMF sub level */ + +#define OPJ_IMF_MAINLEVEL_MAX 11 /** Maximum main level */ + +/** Max. Components Sampling Rate (MSamples/sec) per IMF main level */ +#define OPJ_IMF_MAINLEVEL_1_MSAMPLESEC 65 /** MSamples/sec for IMF main level 1 */ +#define OPJ_IMF_MAINLEVEL_2_MSAMPLESEC 130 /** MSamples/sec for IMF main level 2 */ +#define OPJ_IMF_MAINLEVEL_3_MSAMPLESEC 195 /** MSamples/sec for IMF main level 3 */ +#define OPJ_IMF_MAINLEVEL_4_MSAMPLESEC 260 /** MSamples/sec for IMF main level 4 */ +#define OPJ_IMF_MAINLEVEL_5_MSAMPLESEC 520 /** MSamples/sec for IMF main level 5 */ +#define OPJ_IMF_MAINLEVEL_6_MSAMPLESEC 1200 /** MSamples/sec for IMF main level 6 */ +#define OPJ_IMF_MAINLEVEL_7_MSAMPLESEC 2400 /** MSamples/sec for IMF main level 7 */ +#define OPJ_IMF_MAINLEVEL_8_MSAMPLESEC 4800 /** MSamples/sec for IMF main level 8 */ +#define OPJ_IMF_MAINLEVEL_9_MSAMPLESEC 9600 /** MSamples/sec for IMF main level 9 */ +#define OPJ_IMF_MAINLEVEL_10_MSAMPLESEC 19200 /** MSamples/sec for IMF main level 10 */ +#define OPJ_IMF_MAINLEVEL_11_MSAMPLESEC 38400 /** MSamples/sec for IMF main level 11 */ + +/** Max. 
compressed Bit Rate (Mbits/s) per IMF sub level */ +#define OPJ_IMF_SUBLEVEL_1_MBITSSEC 200 /** Mbits/s for IMF sub level 1 */ +#define OPJ_IMF_SUBLEVEL_2_MBITSSEC 400 /** Mbits/s for IMF sub level 2 */ +#define OPJ_IMF_SUBLEVEL_3_MBITSSEC 800 /** Mbits/s for IMF sub level 3 */ +#define OPJ_IMF_SUBLEVEL_4_MBITSSEC 1600 /** Mbits/s for IMF sub level 4 */ +#define OPJ_IMF_SUBLEVEL_5_MBITSSEC 3200 /** Mbits/s for IMF sub level 5 */ +#define OPJ_IMF_SUBLEVEL_6_MBITSSEC 6400 /** Mbits/s for IMF sub level 6 */ +#define OPJ_IMF_SUBLEVEL_7_MBITSSEC 12800 /** Mbits/s for IMF sub level 7 */ +#define OPJ_IMF_SUBLEVEL_8_MBITSSEC 25600 /** Mbits/s for IMF sub level 8 */ +#define OPJ_IMF_SUBLEVEL_9_MBITSSEC 51200 /** Mbits/s for IMF sub level 9 */ + /** * JPEG 2000 codestream and component size limits in cinema profiles * */ @@ -318,6 +348,10 @@ typedef void (*opj_msg_callback)(const char *msg, void *client_data); ========================================================== */ +#ifndef OPJ_UINT32_SEMANTICALLY_BUT_INT32 +#define OPJ_UINT32_SEMANTICALLY_BUT_INT32 OPJ_INT32 +#endif + /** * Progression order changes * @@ -333,10 +367,10 @@ typedef struct opj_poc { OPJ_PROG_ORDER prg1, prg; /** Progression order string*/ OPJ_CHAR progorder[5]; - /** Tile number */ + /** Tile number (starting at 1) */ OPJ_UINT32 tile; /** Start and end values for Tile width and height*/ - OPJ_INT32 tx0, tx1, ty0, ty1; + OPJ_UINT32_SEMANTICALLY_BUT_INT32 tx0, tx1, ty0, ty1; /** Start value, initialised in pi_initialise_encode*/ OPJ_UINT32 layS, resS, compS, prcS; /** End value, initialised in pi_initialise_encode */ @@ -1314,15 +1348,14 @@ OPJ_API OPJ_BOOL OPJ_CALLCONV opj_setup_decoder(opj_codec_t *p_codec, * number, or "ALL_CPUS". If OPJ_NUM_THREADS is set and this function is called, * this function will override the behaviour of the environment variable. * - * Currently this function must be called after opj_setup_decoder() and - * before opj_read_header(). + * This function must be called after opj_setup_decoder() and + * before opj_read_header() for the decoding side, or after opj_setup_encoder() + * and before opj_start_compress() for the encoding side. * - * Note: currently only has effect on the decompressor. - * - * @param p_codec decompressor handler + * @param p_codec decompressor or compressor handler * @param num_threads number of threads. * - * @return OPJ_TRUE if the decoder is correctly set + * @return OPJ_TRUE if the function is successful. */ OPJ_API OPJ_BOOL OPJ_CALLCONV opj_codec_set_threads(opj_codec_t *p_codec, int num_threads); @@ -1546,6 +1579,33 @@ OPJ_API OPJ_BOOL OPJ_CALLCONV opj_setup_encoder(opj_codec_t *p_codec, opj_cparameters_t *parameters, opj_image_t *image); + +/** + * Specify extra options for the encoder. + * + * This may be called after opj_setup_encoder() and before opj_start_compress() + * + * This is the way to add new options in a fully ABI compatible way, without + * extending the opj_cparameters_t structure. + * + * Currently supported options are: + *
<ul> + * <li>PLT=YES/NO. Defaults to NO. If set to YES, PLT marker segments, + * indicating the length of each packet in the tile-part header, will be + * written. Since 2.3.2</li> + * </ul>
+ * + * @param p_codec Compressor handle + * @param p_options Compression options. This should be a NULL terminated + * array of strings. Each string is of the form KEY=VALUE. + * + * @return OPJ_TRUE in case of success. + * @since 2.3.2 + */ +OPJ_API OPJ_BOOL OPJ_CALLCONV opj_encoder_set_extra_options( + opj_codec_t *p_codec, + const char* const* p_options); + /** * Start to compress the current image. * @param p_codec Compressor handle diff --git a/3rdparty/openjpeg/openjp2/opj_codec.h b/3rdparty/openjpeg/openjp2/opj_codec.h index b962b12163..8a8af9119e 100644 --- a/3rdparty/openjpeg/openjp2/opj_codec.h +++ b/3rdparty/openjpeg/openjp2/opj_codec.h @@ -148,6 +148,11 @@ typedef struct opj_codec_private { opj_cparameters_t * p_param, struct opj_image * p_image, struct opj_event_mgr * p_manager); + + OPJ_BOOL(* opj_encoder_set_extra_options)(void * p_codec, + const char* const* p_options, + struct opj_event_mgr * p_manager); + } m_compression; } m_codec_data; /** FIXME DOC*/ diff --git a/3rdparty/openjpeg/openjp2/opj_common.h b/3rdparty/openjpeg/openjp2/opj_common.h index a051339154..ee8adf4725 100644 --- a/3rdparty/openjpeg/openjp2/opj_common.h +++ b/3rdparty/openjpeg/openjp2/opj_common.h @@ -38,4 +38,10 @@ */ #define OPJ_COMMON_CBLK_DATA_EXTRA 2 /**< Margin for a fake FFFF marker */ + +#define OPJ_COMP_PARAM_DEFAULT_CBLOCKW 64 +#define OPJ_COMP_PARAM_DEFAULT_CBLOCKH 64 +#define OPJ_COMP_PARAM_DEFAULT_PROG_ORDER OPJ_LRCP +#define OPJ_COMP_PARAM_DEFAULT_NUMRESOLUTION 6 + #endif /* OPJ_COMMMON_H */ diff --git a/3rdparty/openjpeg/openjp2/opj_intmath.h b/3rdparty/openjpeg/openjp2/opj_intmath.h index 754b5512ff..afe69d90c0 100644 --- a/3rdparty/openjpeg/openjp2/opj_intmath.h +++ b/3rdparty/openjpeg/openjp2/opj_intmath.h @@ -208,6 +208,16 @@ static INLINE OPJ_INT32 opj_int_floordivpow2(OPJ_INT32 a, OPJ_INT32 b) { return a >> b; } + +/** +Divide an integer by a power of 2 and round downwards +@return Returns a divided by 2^b +*/ +static INLINE OPJ_UINT32 opj_uint_floordivpow2(OPJ_UINT32 a, OPJ_UINT32 b) +{ + return a >> b; +} + /** Get logarithm of an integer and round downwards @return Returns log2(a) diff --git a/3rdparty/openjpeg/openjp2/pi.c b/3rdparty/openjpeg/openjp2/pi.c index 4a6ed68e2b..4f7dd50f16 100644 --- a/3rdparty/openjpeg/openjp2/pi.c +++ b/3rdparty/openjpeg/openjp2/pi.c @@ -36,6 +36,8 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ +#define OPJ_UINT32_SEMANTICALLY_BUT_INT32 OPJ_UINT32 + #include "opj_includes.h" /** @defgroup PI PI - Implementation of a packet iterator */ @@ -91,10 +93,10 @@ static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi); */ static void opj_pi_update_encode_poc_and_final(opj_cp_t *p_cp, OPJ_UINT32 p_tileno, - OPJ_INT32 p_tx0, - OPJ_INT32 p_tx1, - OPJ_INT32 p_ty0, - OPJ_INT32 p_ty1, + OPJ_UINT32 p_tx0, + OPJ_UINT32 p_tx1, + OPJ_UINT32 p_ty0, + OPJ_UINT32 p_ty1, OPJ_UINT32 p_max_prec, OPJ_UINT32 p_max_res, OPJ_UINT32 p_dx_min, @@ -118,10 +120,10 @@ static void opj_pi_update_encode_poc_and_final(opj_cp_t *p_cp, static void opj_pi_update_encode_not_poc(opj_cp_t *p_cp, OPJ_UINT32 p_num_comps, OPJ_UINT32 p_tileno, - OPJ_INT32 p_tx0, - OPJ_INT32 p_tx1, - OPJ_INT32 p_ty0, - OPJ_INT32 p_ty1, + OPJ_UINT32 p_tx0, + OPJ_UINT32 p_tx1, + OPJ_UINT32 p_ty0, + OPJ_UINT32 p_ty1, OPJ_UINT32 p_max_prec, OPJ_UINT32 p_max_res, OPJ_UINT32 p_dx_min, @@ -144,10 +146,10 @@ static void opj_pi_update_encode_not_poc(opj_cp_t *p_cp, static void opj_get_encoding_parameters(const opj_image_t *p_image, const opj_cp_t *p_cp, OPJ_UINT32 tileno, - OPJ_INT32 * p_tx0, - OPJ_INT32 * p_tx1, - OPJ_INT32 * p_ty0, - OPJ_INT32 * p_ty1, + OPJ_UINT32 * p_tx0, + OPJ_UINT32 * p_tx1, + OPJ_UINT32 * p_ty0, + OPJ_UINT32 * p_ty1, OPJ_UINT32 * p_dx_min, OPJ_UINT32 * p_dy_min, OPJ_UINT32 * p_max_prec, @@ -176,10 +178,10 @@ static void opj_get_encoding_parameters(const opj_image_t *p_image, static void opj_get_all_encoding_parameters(const opj_image_t *p_image, const opj_cp_t *p_cp, OPJ_UINT32 tileno, - OPJ_INT32 * p_tx0, - OPJ_INT32 * p_tx1, - OPJ_INT32 * p_ty0, - OPJ_INT32 * p_ty1, + OPJ_UINT32 * p_tx0, + OPJ_UINT32 * p_tx1, + OPJ_UINT32 * p_ty0, + OPJ_UINT32 * p_ty1, OPJ_UINT32 * p_dx_min, OPJ_UINT32 * p_dy_min, OPJ_UINT32 * p_max_prec, @@ -192,10 +194,12 @@ static void opj_get_all_encoding_parameters(const opj_image_t *p_image, * @param p_image the image used to initialize the packet iterator (in fact only the number of components is relevant. * @param p_cp the coding parameters. * @param tileno the index of the tile from which creating the packet iterator. 
+ * @param manager Event manager */ static opj_pi_iterator_t * opj_pi_create(const opj_image_t *p_image, const opj_cp_t *p_cp, - OPJ_UINT32 tileno); + OPJ_UINT32 tileno, + opj_event_mgr_t* manager); /** * FIXME DOC */ @@ -230,18 +234,19 @@ static OPJ_BOOL opj_pi_check_next_level(OPJ_INT32 pos, ========================================================== */ -static void opj_pi_emit_error(opj_pi_iterator_t * pi, const char* msg) -{ - (void)pi; - (void)msg; -} - static OPJ_BOOL opj_pi_next_lrcp(opj_pi_iterator_t * pi) { opj_pi_comp_t *comp = NULL; opj_pi_resolution_t *res = NULL; OPJ_UINT32 index = 0; + if (pi->poc.compno0 >= pi->numcomps || + pi->poc.compno1 >= pi->numcomps + 1) { + opj_event_msg(pi->manager, EVT_ERROR, + "opj_pi_next_lrcp(): invalid compno0/compno1\n"); + return OPJ_FALSE; + } + if (!pi->first) { comp = &pi->comps[pi->compno]; res = &comp->resolutions[pi->resno]; @@ -272,7 +277,7 @@ static OPJ_BOOL opj_pi_next_lrcp(opj_pi_iterator_t * pi) /* include should be resized when a POC arises, or */ /* the POC should be rejected */ if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); + opj_event_msg(pi->manager, EVT_ERROR, "Invalid access to pi->include"); return OPJ_FALSE; } if (!pi->include[index]) { @@ -295,6 +300,13 @@ static OPJ_BOOL opj_pi_next_rlcp(opj_pi_iterator_t * pi) opj_pi_resolution_t *res = NULL; OPJ_UINT32 index = 0; + if (pi->poc.compno0 >= pi->numcomps || + pi->poc.compno1 >= pi->numcomps + 1) { + opj_event_msg(pi->manager, EVT_ERROR, + "opj_pi_next_rlcp(): invalid compno0/compno1\n"); + return OPJ_FALSE; + } + if (!pi->first) { comp = &pi->comps[pi->compno]; res = &comp->resolutions[pi->resno]; @@ -318,7 +330,7 @@ static OPJ_BOOL opj_pi_next_rlcp(opj_pi_iterator_t * pi) index = pi->layno * pi->step_l + pi->resno * pi->step_r + pi->compno * pi->step_c + pi->precno * pi->step_p; if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); + opj_event_msg(pi->manager, EVT_ERROR, "Invalid access to pi->include"); return OPJ_FALSE; } if (!pi->include[index]) { @@ -341,6 +353,13 @@ static OPJ_BOOL opj_pi_next_rpcl(opj_pi_iterator_t * pi) opj_pi_resolution_t *res = NULL; OPJ_UINT32 index = 0; + if (pi->poc.compno0 >= pi->numcomps || + pi->poc.compno1 >= pi->numcomps + 1) { + opj_event_msg(pi->manager, EVT_ERROR, + "opj_pi_next_rpcl(): invalid compno0/compno1\n"); + return OPJ_FALSE; + } + if (!pi->first) { goto LABEL_SKIP; } else { @@ -376,16 +395,16 @@ static OPJ_BOOL opj_pi_next_rpcl(opj_pi_iterator_t * pi) pi->poc.tx1 = pi->tx1; } for (pi->resno = pi->poc.resno0; pi->resno < pi->poc.resno1; pi->resno++) { - for (pi->y = pi->poc.ty0; pi->y < pi->poc.ty1; - pi->y += (OPJ_INT32)(pi->dy - (OPJ_UINT32)(pi->y % (OPJ_INT32)pi->dy))) { - for (pi->x = pi->poc.tx0; pi->x < pi->poc.tx1; - pi->x += (OPJ_INT32)(pi->dx - (OPJ_UINT32)(pi->x % (OPJ_INT32)pi->dx))) { + for (pi->y = (OPJ_UINT32)pi->poc.ty0; pi->y < (OPJ_UINT32)pi->poc.ty1; + pi->y += (pi->dy - (pi->y % pi->dy))) { + for (pi->x = (OPJ_UINT32)pi->poc.tx0; pi->x < (OPJ_UINT32)pi->poc.tx1; + pi->x += (pi->dx - (pi->x % pi->dx))) { for (pi->compno = pi->poc.compno0; pi->compno < pi->poc.compno1; pi->compno++) { OPJ_UINT32 levelno; - OPJ_INT32 trx0, try0; - OPJ_INT32 trx1, try1; + OPJ_UINT32 trx0, try0; + OPJ_UINT32 trx1, try1; OPJ_UINT32 rpx, rpy; - OPJ_INT32 prci, prcj; + OPJ_UINT32 prci, prcj; comp = &pi->comps[pi->compno]; if (pi->resno >= comp->numresolutions) { continue; @@ -404,10 +423,10 @@ static OPJ_BOOL opj_pi_next_rpcl(opj_pi_iterator_t * pi) 
(comp->dy << levelno) > INT_MAX) { continue; } - trx0 = opj_int_ceildiv(pi->tx0, (OPJ_INT32)(comp->dx << levelno)); - try0 = opj_int_ceildiv(pi->ty0, (OPJ_INT32)(comp->dy << levelno)); - trx1 = opj_int_ceildiv(pi->tx1, (OPJ_INT32)(comp->dx << levelno)); - try1 = opj_int_ceildiv(pi->ty1, (OPJ_INT32)(comp->dy << levelno)); + trx0 = opj_uint_ceildiv(pi->tx0, (comp->dx << levelno)); + try0 = opj_uint_ceildiv(pi->ty0, (comp->dy << levelno)); + trx1 = opj_uint_ceildiv(pi->tx1, (comp->dx << levelno)); + try1 = opj_uint_ceildiv(pi->ty1, (comp->dy << levelno)); rpx = res->pdx + levelno; rpy = res->pdy + levelno; @@ -421,12 +440,12 @@ static OPJ_BOOL opj_pi_next_rpcl(opj_pi_iterator_t * pi) } /* See ISO-15441. B.12.1.3 Resolution level-position-component-layer progression */ - if (!((pi->y % (OPJ_INT32)(comp->dy << rpy) == 0) || ((pi->y == pi->ty0) && - ((try0 << levelno) % (1 << rpy))))) { + if (!((pi->y % (comp->dy << rpy) == 0) || ((pi->y == pi->ty0) && + ((try0 << levelno) % (1U << rpy))))) { continue; } - if (!((pi->x % (OPJ_INT32)(comp->dx << rpx) == 0) || ((pi->x == pi->tx0) && - ((trx0 << levelno) % (1 << rpx))))) { + if (!((pi->x % (comp->dx << rpx) == 0) || ((pi->x == pi->tx0) && + ((trx0 << levelno) % (1U << rpx))))) { continue; } @@ -438,18 +457,18 @@ static OPJ_BOOL opj_pi_next_rpcl(opj_pi_iterator_t * pi) continue; } - prci = opj_int_floordivpow2(opj_int_ceildiv(pi->x, - (OPJ_INT32)(comp->dx << levelno)), (OPJ_INT32)res->pdx) - - opj_int_floordivpow2(trx0, (OPJ_INT32)res->pdx); - prcj = opj_int_floordivpow2(opj_int_ceildiv(pi->y, - (OPJ_INT32)(comp->dy << levelno)), (OPJ_INT32)res->pdy) - - opj_int_floordivpow2(try0, (OPJ_INT32)res->pdy); - pi->precno = (OPJ_UINT32)(prci + prcj * (OPJ_INT32)res->pw); + prci = opj_uint_floordivpow2(opj_uint_ceildiv(pi->x, + (comp->dx << levelno)), res->pdx) + - opj_uint_floordivpow2(trx0, res->pdx); + prcj = opj_uint_floordivpow2(opj_uint_ceildiv(pi->y, + (comp->dy << levelno)), res->pdy) + - opj_uint_floordivpow2(try0, res->pdy); + pi->precno = prci + prcj * res->pw; for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) { index = pi->layno * pi->step_l + pi->resno * pi->step_r + pi->compno * pi->step_c + pi->precno * pi->step_p; if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); + opj_event_msg(pi->manager, EVT_ERROR, "Invalid access to pi->include"); return OPJ_FALSE; } if (!pi->include[index]) { @@ -473,6 +492,13 @@ static OPJ_BOOL opj_pi_next_pcrl(opj_pi_iterator_t * pi) opj_pi_resolution_t *res = NULL; OPJ_UINT32 index = 0; + if (pi->poc.compno0 >= pi->numcomps || + pi->poc.compno1 >= pi->numcomps + 1) { + opj_event_msg(pi->manager, EVT_ERROR, + "opj_pi_next_pcrl(): invalid compno0/compno1\n"); + return OPJ_FALSE; + } + if (!pi->first) { comp = &pi->comps[pi->compno]; goto LABEL_SKIP; @@ -508,19 +534,19 @@ static OPJ_BOOL opj_pi_next_pcrl(opj_pi_iterator_t * pi) pi->poc.ty1 = pi->ty1; pi->poc.tx1 = pi->tx1; } - for (pi->y = pi->poc.ty0; pi->y < pi->poc.ty1; - pi->y += (OPJ_INT32)(pi->dy - (OPJ_UINT32)(pi->y % (OPJ_INT32)pi->dy))) { - for (pi->x = pi->poc.tx0; pi->x < pi->poc.tx1; - pi->x += (OPJ_INT32)(pi->dx - (OPJ_UINT32)(pi->x % (OPJ_INT32)pi->dx))) { + for (pi->y = (OPJ_UINT32)pi->poc.ty0; pi->y < (OPJ_UINT32)pi->poc.ty1; + pi->y += (pi->dy - (pi->y % pi->dy))) { + for (pi->x = (OPJ_UINT32)pi->poc.tx0; pi->x < (OPJ_UINT32)pi->poc.tx1; + pi->x += (pi->dx - (pi->x % pi->dx))) { for (pi->compno = pi->poc.compno0; pi->compno < pi->poc.compno1; pi->compno++) { comp = &pi->comps[pi->compno]; 
for (pi->resno = pi->poc.resno0; pi->resno < opj_uint_min(pi->poc.resno1, comp->numresolutions); pi->resno++) { OPJ_UINT32 levelno; - OPJ_INT32 trx0, try0; - OPJ_INT32 trx1, try1; + OPJ_UINT32 trx0, try0; + OPJ_UINT32 trx1, try1; OPJ_UINT32 rpx, rpy; - OPJ_INT32 prci, prcj; + OPJ_UINT32 prci, prcj; res = &comp->resolutions[pi->resno]; levelno = comp->numresolutions - 1 - pi->resno; /* Avoids division by zero */ @@ -535,10 +561,10 @@ static OPJ_BOOL opj_pi_next_pcrl(opj_pi_iterator_t * pi) (comp->dy << levelno) > INT_MAX) { continue; } - trx0 = opj_int_ceildiv(pi->tx0, (OPJ_INT32)(comp->dx << levelno)); - try0 = opj_int_ceildiv(pi->ty0, (OPJ_INT32)(comp->dy << levelno)); - trx1 = opj_int_ceildiv(pi->tx1, (OPJ_INT32)(comp->dx << levelno)); - try1 = opj_int_ceildiv(pi->ty1, (OPJ_INT32)(comp->dy << levelno)); + trx0 = opj_uint_ceildiv(pi->tx0, (comp->dx << levelno)); + try0 = opj_uint_ceildiv(pi->ty0, (comp->dy << levelno)); + trx1 = opj_uint_ceildiv(pi->tx1, (comp->dx << levelno)); + try1 = opj_uint_ceildiv(pi->ty1, (comp->dy << levelno)); rpx = res->pdx + levelno; rpy = res->pdy + levelno; @@ -552,12 +578,12 @@ static OPJ_BOOL opj_pi_next_pcrl(opj_pi_iterator_t * pi) } /* See ISO-15441. B.12.1.4 Position-component-resolution level-layer progression */ - if (!((pi->y % (OPJ_INT32)(comp->dy << rpy) == 0) || ((pi->y == pi->ty0) && - ((try0 << levelno) % (1 << rpy))))) { + if (!((pi->y % (comp->dy << rpy) == 0) || ((pi->y == pi->ty0) && + ((try0 << levelno) % (1U << rpy))))) { continue; } - if (!((pi->x % (OPJ_INT32)(comp->dx << rpx) == 0) || ((pi->x == pi->tx0) && - ((trx0 << levelno) % (1 << rpx))))) { + if (!((pi->x % (comp->dx << rpx) == 0) || ((pi->x == pi->tx0) && + ((trx0 << levelno) % (1U << rpx))))) { continue; } @@ -569,18 +595,18 @@ static OPJ_BOOL opj_pi_next_pcrl(opj_pi_iterator_t * pi) continue; } - prci = opj_int_floordivpow2(opj_int_ceildiv(pi->x, - (OPJ_INT32)(comp->dx << levelno)), (OPJ_INT32)res->pdx) - - opj_int_floordivpow2(trx0, (OPJ_INT32)res->pdx); - prcj = opj_int_floordivpow2(opj_int_ceildiv(pi->y, - (OPJ_INT32)(comp->dy << levelno)), (OPJ_INT32)res->pdy) - - opj_int_floordivpow2(try0, (OPJ_INT32)res->pdy); - pi->precno = (OPJ_UINT32)(prci + prcj * (OPJ_INT32)res->pw); + prci = opj_uint_floordivpow2(opj_uint_ceildiv(pi->x, + (comp->dx << levelno)), res->pdx) + - opj_uint_floordivpow2(trx0, res->pdx); + prcj = opj_uint_floordivpow2(opj_uint_ceildiv(pi->y, + (comp->dy << levelno)), res->pdy) + - opj_uint_floordivpow2(try0, res->pdy); + pi->precno = prci + prcj * res->pw; for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) { index = pi->layno * pi->step_l + pi->resno * pi->step_r + pi->compno * pi->step_c + pi->precno * pi->step_p; if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); + opj_event_msg(pi->manager, EVT_ERROR, "Invalid access to pi->include"); return OPJ_FALSE; } if (!pi->include[index]) { @@ -604,6 +630,13 @@ static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi) opj_pi_resolution_t *res = NULL; OPJ_UINT32 index = 0; + if (pi->poc.compno0 >= pi->numcomps || + pi->poc.compno1 >= pi->numcomps + 1) { + opj_event_msg(pi->manager, EVT_ERROR, + "opj_pi_next_cprl(): invalid compno0/compno1\n"); + return OPJ_FALSE; + } + if (!pi->first) { comp = &pi->comps[pi->compno]; goto LABEL_SKIP; @@ -639,17 +672,17 @@ static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi) pi->poc.ty1 = pi->ty1; pi->poc.tx1 = pi->tx1; } - for (pi->y = pi->poc.ty0; pi->y < pi->poc.ty1; - pi->y += (OPJ_INT32)(pi->dy - 
(OPJ_UINT32)(pi->y % (OPJ_INT32)pi->dy))) { - for (pi->x = pi->poc.tx0; pi->x < pi->poc.tx1; - pi->x += (OPJ_INT32)(pi->dx - (OPJ_UINT32)(pi->x % (OPJ_INT32)pi->dx))) { + for (pi->y = (OPJ_UINT32)pi->poc.ty0; pi->y < (OPJ_UINT32)pi->poc.ty1; + pi->y += (pi->dy - (pi->y % pi->dy))) { + for (pi->x = (OPJ_UINT32)pi->poc.tx0; pi->x < (OPJ_UINT32)pi->poc.tx1; + pi->x += (pi->dx - (pi->x % pi->dx))) { for (pi->resno = pi->poc.resno0; pi->resno < opj_uint_min(pi->poc.resno1, comp->numresolutions); pi->resno++) { OPJ_UINT32 levelno; - OPJ_INT32 trx0, try0; - OPJ_INT32 trx1, try1; + OPJ_UINT32 trx0, try0; + OPJ_UINT32 trx1, try1; OPJ_UINT32 rpx, rpy; - OPJ_INT32 prci, prcj; + OPJ_UINT32 prci, prcj; res = &comp->resolutions[pi->resno]; levelno = comp->numresolutions - 1 - pi->resno; /* Avoids division by zero on id_000004,sig_06,src_000679,op_arith8,pos_49,val_-17 */ @@ -663,10 +696,10 @@ static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi) (comp->dy << levelno) > INT_MAX) { continue; } - trx0 = opj_int_ceildiv(pi->tx0, (OPJ_INT32)(comp->dx << levelno)); - try0 = opj_int_ceildiv(pi->ty0, (OPJ_INT32)(comp->dy << levelno)); - trx1 = opj_int_ceildiv(pi->tx1, (OPJ_INT32)(comp->dx << levelno)); - try1 = opj_int_ceildiv(pi->ty1, (OPJ_INT32)(comp->dy << levelno)); + trx0 = opj_uint_ceildiv(pi->tx0, (comp->dx << levelno)); + try0 = opj_uint_ceildiv(pi->ty0, (comp->dy << levelno)); + trx1 = opj_uint_ceildiv(pi->tx1, (comp->dx << levelno)); + try1 = opj_uint_ceildiv(pi->ty1, (comp->dy << levelno)); rpx = res->pdx + levelno; rpy = res->pdy + levelno; @@ -680,12 +713,12 @@ static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi) } /* See ISO-15441. B.12.1.5 Component-position-resolution level-layer progression */ - if (!((pi->y % (OPJ_INT32)(comp->dy << rpy) == 0) || ((pi->y == pi->ty0) && - ((try0 << levelno) % (1 << rpy))))) { + if (!((pi->y % (comp->dy << rpy) == 0) || ((pi->y == pi->ty0) && + ((try0 << levelno) % (1U << rpy))))) { continue; } - if (!((pi->x % (OPJ_INT32)(comp->dx << rpx) == 0) || ((pi->x == pi->tx0) && - ((trx0 << levelno) % (1 << rpx))))) { + if (!((pi->x % (comp->dx << rpx) == 0) || ((pi->x == pi->tx0) && + ((trx0 << levelno) % (1U << rpx))))) { continue; } @@ -697,18 +730,18 @@ static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi) continue; } - prci = opj_int_floordivpow2(opj_int_ceildiv(pi->x, - (OPJ_INT32)(comp->dx << levelno)), (OPJ_INT32)res->pdx) - - opj_int_floordivpow2(trx0, (OPJ_INT32)res->pdx); - prcj = opj_int_floordivpow2(opj_int_ceildiv(pi->y, - (OPJ_INT32)(comp->dy << levelno)), (OPJ_INT32)res->pdy) - - opj_int_floordivpow2(try0, (OPJ_INT32)res->pdy); - pi->precno = (OPJ_UINT32)(prci + prcj * (OPJ_INT32)res->pw); + prci = opj_uint_floordivpow2(opj_uint_ceildiv(pi->x, + (comp->dx << levelno)), res->pdx) + - opj_uint_floordivpow2(trx0, res->pdx); + prcj = opj_uint_floordivpow2(opj_uint_ceildiv(pi->y, + (comp->dy << levelno)), res->pdy) + - opj_uint_floordivpow2(try0, res->pdy); + pi->precno = (OPJ_UINT32)(prci + prcj * res->pw); for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) { index = pi->layno * pi->step_l + pi->resno * pi->step_r + pi->compno * pi->step_c + pi->precno * pi->step_p; if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); + opj_event_msg(pi->manager, EVT_ERROR, "Invalid access to pi->include"); return OPJ_FALSE; } if (!pi->include[index]) { @@ -729,10 +762,10 @@ LABEL_SKIP: static void opj_get_encoding_parameters(const opj_image_t *p_image, const opj_cp_t *p_cp, OPJ_UINT32 p_tileno, - 
OPJ_INT32 * p_tx0, - OPJ_INT32 * p_tx1, - OPJ_INT32 * p_ty0, - OPJ_INT32 * p_ty1, + OPJ_UINT32 * p_tx0, + OPJ_UINT32 * p_tx1, + OPJ_UINT32 * p_ty0, + OPJ_UINT32 * p_ty1, OPJ_UINT32 * p_dx_min, OPJ_UINT32 * p_dy_min, OPJ_UINT32 * p_max_prec, @@ -768,12 +801,12 @@ static void opj_get_encoding_parameters(const opj_image_t *p_image, /* find extent of tile */ l_tx0 = p_cp->tx0 + p * p_cp->tdx; /* can't be greater than p_image->x1 so won't overflow */ - *p_tx0 = (OPJ_INT32)opj_uint_max(l_tx0, p_image->x0); - *p_tx1 = (OPJ_INT32)opj_uint_min(opj_uint_adds(l_tx0, p_cp->tdx), p_image->x1); + *p_tx0 = opj_uint_max(l_tx0, p_image->x0); + *p_tx1 = opj_uint_min(opj_uint_adds(l_tx0, p_cp->tdx), p_image->x1); l_ty0 = p_cp->ty0 + q * p_cp->tdy; /* can't be greater than p_image->y1 so won't overflow */ - *p_ty0 = (OPJ_INT32)opj_uint_max(l_ty0, p_image->y0); - *p_ty1 = (OPJ_INT32)opj_uint_min(opj_uint_adds(l_ty0, p_cp->tdy), p_image->y1); + *p_ty0 = opj_uint_max(l_ty0, p_image->y0); + *p_ty1 = opj_uint_min(opj_uint_adds(l_ty0, p_cp->tdy), p_image->y1); /* max precision is 0 (can only grow) */ *p_max_prec = 0; @@ -786,17 +819,17 @@ static void opj_get_encoding_parameters(const opj_image_t *p_image, for (compno = 0; compno < p_image->numcomps; ++compno) { /* arithmetic variables to calculate */ OPJ_UINT32 l_level_no; - OPJ_INT32 l_rx0, l_ry0, l_rx1, l_ry1; - OPJ_INT32 l_px0, l_py0, l_px1, py1; + OPJ_UINT32 l_rx0, l_ry0, l_rx1, l_ry1; + OPJ_UINT32 l_px0, l_py0, l_px1, py1; OPJ_UINT32 l_pdx, l_pdy; OPJ_UINT32 l_pw, l_ph; OPJ_UINT32 l_product; - OPJ_INT32 l_tcx0, l_tcy0, l_tcx1, l_tcy1; + OPJ_UINT32 l_tcx0, l_tcy0, l_tcx1, l_tcy1; - l_tcx0 = opj_int_ceildiv(*p_tx0, (OPJ_INT32)l_img_comp->dx); - l_tcy0 = opj_int_ceildiv(*p_ty0, (OPJ_INT32)l_img_comp->dy); - l_tcx1 = opj_int_ceildiv(*p_tx1, (OPJ_INT32)l_img_comp->dx); - l_tcy1 = opj_int_ceildiv(*p_ty1, (OPJ_INT32)l_img_comp->dy); + l_tcx0 = opj_uint_ceildiv(*p_tx0, l_img_comp->dx); + l_tcy0 = opj_uint_ceildiv(*p_ty0, l_img_comp->dy); + l_tcx1 = opj_uint_ceildiv(*p_tx1, l_img_comp->dx); + l_tcy1 = opj_uint_ceildiv(*p_ty1, l_img_comp->dy); if (l_tccp->numresolutions > *p_max_res) { *p_max_res = l_tccp->numresolutions; @@ -820,19 +853,19 @@ static void opj_get_encoding_parameters(const opj_image_t *p_image, /* various calculations of extents */ l_level_no = l_tccp->numresolutions - 1 - resno; - l_rx0 = opj_int_ceildivpow2(l_tcx0, (OPJ_INT32)l_level_no); - l_ry0 = opj_int_ceildivpow2(l_tcy0, (OPJ_INT32)l_level_no); - l_rx1 = opj_int_ceildivpow2(l_tcx1, (OPJ_INT32)l_level_no); - l_ry1 = opj_int_ceildivpow2(l_tcy1, (OPJ_INT32)l_level_no); + l_rx0 = opj_uint_ceildivpow2(l_tcx0, l_level_no); + l_ry0 = opj_uint_ceildivpow2(l_tcy0, l_level_no); + l_rx1 = opj_uint_ceildivpow2(l_tcx1, l_level_no); + l_ry1 = opj_uint_ceildivpow2(l_tcy1, l_level_no); - l_px0 = opj_int_floordivpow2(l_rx0, (OPJ_INT32)l_pdx) << l_pdx; - l_py0 = opj_int_floordivpow2(l_ry0, (OPJ_INT32)l_pdy) << l_pdy; - l_px1 = opj_int_ceildivpow2(l_rx1, (OPJ_INT32)l_pdx) << l_pdx; + l_px0 = opj_uint_floordivpow2(l_rx0, l_pdx) << l_pdx; + l_py0 = opj_uint_floordivpow2(l_ry0, l_pdy) << l_pdy; + l_px1 = opj_uint_ceildivpow2(l_rx1, l_pdx) << l_pdx; - py1 = opj_int_ceildivpow2(l_ry1, (OPJ_INT32)l_pdy) << l_pdy; + py1 = opj_uint_ceildivpow2(l_ry1, l_pdy) << l_pdy; - l_pw = (l_rx0 == l_rx1) ? 0 : (OPJ_UINT32)((l_px1 - l_px0) >> l_pdx); - l_ph = (l_ry0 == l_ry1) ? 0 : (OPJ_UINT32)((py1 - l_py0) >> l_pdy); + l_pw = (l_rx0 == l_rx1) ? 0 : ((l_px1 - l_px0) >> l_pdx); + l_ph = (l_ry0 == l_ry1) ? 
0 : ((py1 - l_py0) >> l_pdy); l_product = l_pw * l_ph; @@ -850,10 +883,10 @@ static void opj_get_encoding_parameters(const opj_image_t *p_image, static void opj_get_all_encoding_parameters(const opj_image_t *p_image, const opj_cp_t *p_cp, OPJ_UINT32 tileno, - OPJ_INT32 * p_tx0, - OPJ_INT32 * p_tx1, - OPJ_INT32 * p_ty0, - OPJ_INT32 * p_ty1, + OPJ_UINT32 * p_tx0, + OPJ_UINT32 * p_tx1, + OPJ_UINT32 * p_ty0, + OPJ_UINT32 * p_ty1, OPJ_UINT32 * p_dx_min, OPJ_UINT32 * p_dy_min, OPJ_UINT32 * p_max_prec, @@ -894,12 +927,12 @@ static void opj_get_all_encoding_parameters(const opj_image_t *p_image, /* here calculation of tx0, tx1, ty0, ty1, maxprec, l_dx and l_dy */ l_tx0 = p_cp->tx0 + p * p_cp->tdx; /* can't be greater than p_image->x1 so won't overflow */ - *p_tx0 = (OPJ_INT32)opj_uint_max(l_tx0, p_image->x0); - *p_tx1 = (OPJ_INT32)opj_uint_min(opj_uint_adds(l_tx0, p_cp->tdx), p_image->x1); + *p_tx0 = opj_uint_max(l_tx0, p_image->x0); + *p_tx1 = opj_uint_min(opj_uint_adds(l_tx0, p_cp->tdx), p_image->x1); l_ty0 = p_cp->ty0 + q * p_cp->tdy; /* can't be greater than p_image->y1 so won't overflow */ - *p_ty0 = (OPJ_INT32)opj_uint_max(l_ty0, p_image->y0); - *p_ty1 = (OPJ_INT32)opj_uint_min(opj_uint_adds(l_ty0, p_cp->tdy), p_image->y1); + *p_ty0 = opj_uint_max(l_ty0, p_image->y0); + *p_ty1 = opj_uint_min(opj_uint_adds(l_ty0, p_cp->tdy), p_image->y1); /* max precision and resolution is 0 (can only grow)*/ *p_max_prec = 0; @@ -912,18 +945,18 @@ static void opj_get_all_encoding_parameters(const opj_image_t *p_image, for (compno = 0; compno < p_image->numcomps; ++compno) { /* aritmetic variables to calculate*/ OPJ_UINT32 l_level_no; - OPJ_INT32 l_rx0, l_ry0, l_rx1, l_ry1; - OPJ_INT32 l_px0, l_py0, l_px1, py1; + OPJ_UINT32 l_rx0, l_ry0, l_rx1, l_ry1; + OPJ_UINT32 l_px0, l_py0, l_px1, py1; OPJ_UINT32 l_product; - OPJ_INT32 l_tcx0, l_tcy0, l_tcx1, l_tcy1; + OPJ_UINT32 l_tcx0, l_tcy0, l_tcx1, l_tcy1; OPJ_UINT32 l_pdx, l_pdy, l_pw, l_ph; - lResolutionPtr = p_resolutions[compno]; + lResolutionPtr = p_resolutions ? 
p_resolutions[compno] : NULL; - l_tcx0 = opj_int_ceildiv(*p_tx0, (OPJ_INT32)l_img_comp->dx); - l_tcy0 = opj_int_ceildiv(*p_ty0, (OPJ_INT32)l_img_comp->dy); - l_tcx1 = opj_int_ceildiv(*p_tx1, (OPJ_INT32)l_img_comp->dx); - l_tcy1 = opj_int_ceildiv(*p_ty1, (OPJ_INT32)l_img_comp->dy); + l_tcx0 = opj_uint_ceildiv(*p_tx0, l_img_comp->dx); + l_tcy0 = opj_uint_ceildiv(*p_ty0, l_img_comp->dy); + l_tcx1 = opj_uint_ceildiv(*p_tx1, l_img_comp->dx); + l_tcy1 = opj_uint_ceildiv(*p_ty1, l_img_comp->dy); if (l_tccp->numresolutions > *p_max_res) { *p_max_res = l_tccp->numresolutions; @@ -939,33 +972,37 @@ static void opj_get_all_encoding_parameters(const opj_image_t *p_image, /* precinct width and height*/ l_pdx = l_tccp->prcw[resno]; l_pdy = l_tccp->prch[resno]; - *lResolutionPtr++ = l_pdx; - *lResolutionPtr++ = l_pdy; + if (lResolutionPtr) { + *lResolutionPtr++ = l_pdx; + *lResolutionPtr++ = l_pdy; + } if (l_pdx + l_level_no < 32 && l_img_comp->dx <= UINT_MAX / (1u << (l_pdx + l_level_no))) { l_dx = l_img_comp->dx * (1u << (l_pdx + l_level_no)); /* take the minimum size for l_dx for each comp and resolution*/ - *p_dx_min = (OPJ_UINT32)opj_int_min((OPJ_INT32) * p_dx_min, (OPJ_INT32)l_dx); + *p_dx_min = opj_uint_min(*p_dx_min, l_dx); } if (l_pdy + l_level_no < 32 && l_img_comp->dy <= UINT_MAX / (1u << (l_pdy + l_level_no))) { l_dy = l_img_comp->dy * (1u << (l_pdy + l_level_no)); - *p_dy_min = (OPJ_UINT32)opj_int_min((OPJ_INT32) * p_dy_min, (OPJ_INT32)l_dy); + *p_dy_min = opj_uint_min(*p_dy_min, l_dy); } /* various calculations of extents*/ - l_rx0 = opj_int_ceildivpow2(l_tcx0, (OPJ_INT32)l_level_no); - l_ry0 = opj_int_ceildivpow2(l_tcy0, (OPJ_INT32)l_level_no); - l_rx1 = opj_int_ceildivpow2(l_tcx1, (OPJ_INT32)l_level_no); - l_ry1 = opj_int_ceildivpow2(l_tcy1, (OPJ_INT32)l_level_no); - l_px0 = opj_int_floordivpow2(l_rx0, (OPJ_INT32)l_pdx) << l_pdx; - l_py0 = opj_int_floordivpow2(l_ry0, (OPJ_INT32)l_pdy) << l_pdy; - l_px1 = opj_int_ceildivpow2(l_rx1, (OPJ_INT32)l_pdx) << l_pdx; - py1 = opj_int_ceildivpow2(l_ry1, (OPJ_INT32)l_pdy) << l_pdy; - l_pw = (l_rx0 == l_rx1) ? 0 : (OPJ_UINT32)((l_px1 - l_px0) >> l_pdx); - l_ph = (l_ry0 == l_ry1) ? 0 : (OPJ_UINT32)((py1 - l_py0) >> l_pdy); - *lResolutionPtr++ = l_pw; - *lResolutionPtr++ = l_ph; + l_rx0 = opj_uint_ceildivpow2(l_tcx0, l_level_no); + l_ry0 = opj_uint_ceildivpow2(l_tcy0, l_level_no); + l_rx1 = opj_uint_ceildivpow2(l_tcx1, l_level_no); + l_ry1 = opj_uint_ceildivpow2(l_tcy1, l_level_no); + l_px0 = opj_uint_floordivpow2(l_rx0, l_pdx) << l_pdx; + l_py0 = opj_uint_floordivpow2(l_ry0, l_pdy) << l_pdy; + l_px1 = opj_uint_ceildivpow2(l_rx1, l_pdx) << l_pdx; + py1 = opj_uint_ceildivpow2(l_ry1, l_pdy) << l_pdy; + l_pw = (l_rx0 == l_rx1) ? 0 : ((l_px1 - l_px0) >> l_pdx); + l_ph = (l_ry0 == l_ry1) ? 0 : ((py1 - l_py0) >> l_pdy); + if (lResolutionPtr) { + *lResolutionPtr++ = l_pw; + *lResolutionPtr++ = l_ph; + } l_product = l_pw * l_ph; /* update precision*/ @@ -981,7 +1018,8 @@ static void opj_get_all_encoding_parameters(const opj_image_t *p_image, static opj_pi_iterator_t * opj_pi_create(const opj_image_t *image, const opj_cp_t *cp, - OPJ_UINT32 tileno) + OPJ_UINT32 tileno, + opj_event_mgr_t* manager) { /* loop*/ OPJ_UINT32 pino, compno; @@ -1015,6 +1053,8 @@ static opj_pi_iterator_t * opj_pi_create(const opj_image_t *image, l_current_pi = l_pi; for (pino = 0; pino < l_poc_bound ; ++pino) { + l_current_pi->manager = manager; + l_current_pi->comps = (opj_pi_comp_t*) opj_calloc(image->numcomps, sizeof(opj_pi_comp_t)); if (! 
l_current_pi->comps) { @@ -1045,10 +1085,10 @@ static opj_pi_iterator_t * opj_pi_create(const opj_image_t *image, static void opj_pi_update_encode_poc_and_final(opj_cp_t *p_cp, OPJ_UINT32 p_tileno, - OPJ_INT32 p_tx0, - OPJ_INT32 p_tx1, - OPJ_INT32 p_ty0, - OPJ_INT32 p_ty1, + OPJ_UINT32 p_tx0, + OPJ_UINT32 p_tx1, + OPJ_UINT32 p_ty0, + OPJ_UINT32 p_ty1, OPJ_UINT32 p_max_prec, OPJ_UINT32 p_max_res, OPJ_UINT32 p_dx_min, @@ -1125,10 +1165,10 @@ static void opj_pi_update_encode_poc_and_final(opj_cp_t *p_cp, static void opj_pi_update_encode_not_poc(opj_cp_t *p_cp, OPJ_UINT32 p_num_comps, OPJ_UINT32 p_tileno, - OPJ_INT32 p_tx0, - OPJ_INT32 p_tx1, - OPJ_INT32 p_ty0, - OPJ_INT32 p_ty1, + OPJ_UINT32 p_tx0, + OPJ_UINT32 p_tx1, + OPJ_UINT32 p_ty0, + OPJ_UINT32 p_ty1, OPJ_UINT32 p_max_prec, OPJ_UINT32 p_max_res, OPJ_UINT32 p_dx_min, @@ -1167,10 +1207,10 @@ static void opj_pi_update_encode_not_poc(opj_cp_t *p_cp, l_current_poc->prg = l_tcp->prg; l_current_poc->prcS = 0; l_current_poc->prcE = p_max_prec; - l_current_poc->txS = (OPJ_UINT32)p_tx0; - l_current_poc->txE = (OPJ_UINT32)p_tx1; - l_current_poc->tyS = (OPJ_UINT32)p_ty0; - l_current_poc->tyE = (OPJ_UINT32)p_ty1; + l_current_poc->txS = p_tx0; + l_current_poc->txE = p_tx1; + l_current_poc->tyS = p_ty0; + l_current_poc->tyE = p_ty1; l_current_poc->dx = p_dx_min; l_current_poc->dy = p_dy_min; ++ l_current_poc; @@ -1352,7 +1392,8 @@ static OPJ_BOOL opj_pi_check_next_level(OPJ_INT32 pos, */ opj_pi_iterator_t *opj_pi_create_decode(opj_image_t *p_image, opj_cp_t *p_cp, - OPJ_UINT32 p_tile_no) + OPJ_UINT32 p_tile_no, + opj_event_mgr_t* manager) { OPJ_UINT32 numcomps = p_image->numcomps; @@ -1367,7 +1408,7 @@ opj_pi_iterator_t *opj_pi_create_decode(opj_image_t *p_image, /* encoding prameters to set */ OPJ_UINT32 l_max_res; OPJ_UINT32 l_max_prec; - OPJ_INT32 l_tx0, l_tx1, l_ty0, l_ty1; + OPJ_UINT32 l_tx0, l_tx1, l_ty0, l_ty1; OPJ_UINT32 l_dx_min, l_dy_min; OPJ_UINT32 l_bound; OPJ_UINT32 l_step_p, l_step_c, l_step_r, l_step_l ; @@ -1407,7 +1448,7 @@ opj_pi_iterator_t *opj_pi_create_decode(opj_image_t *p_image, } /* memory allocation for pi */ - l_pi = opj_pi_create(p_image, p_cp, p_tile_no); + l_pi = opj_pi_create(p_image, p_cp, p_tile_no, manager); if (!l_pi) { opj_free(l_tmp_data); opj_free(l_tmp_ptr); @@ -1548,11 +1589,34 @@ opj_pi_iterator_t *opj_pi_create_decode(opj_image_t *p_image, } +OPJ_UINT32 opj_get_encoding_packet_count(const opj_image_t *p_image, + const opj_cp_t *p_cp, + OPJ_UINT32 p_tile_no) +{ + OPJ_UINT32 l_max_res; + OPJ_UINT32 l_max_prec; + OPJ_UINT32 l_tx0, l_tx1, l_ty0, l_ty1; + OPJ_UINT32 l_dx_min, l_dy_min; + + /* preconditions in debug*/ + assert(p_cp != 00); + assert(p_image != 00); + assert(p_tile_no < p_cp->tw * p_cp->th); + + /* get encoding parameters*/ + opj_get_all_encoding_parameters(p_image, p_cp, p_tile_no, &l_tx0, &l_tx1, + &l_ty0, &l_ty1, &l_dx_min, &l_dy_min, &l_max_prec, &l_max_res, NULL); + + return p_cp->tcps[p_tile_no].numlayers * l_max_prec * p_image->numcomps * + l_max_res; +} + opj_pi_iterator_t *opj_pi_initialise_encode(const opj_image_t *p_image, opj_cp_t *p_cp, OPJ_UINT32 p_tile_no, - J2K_T2_MODE p_t2_mode) + J2K_T2_MODE p_t2_mode, + opj_event_mgr_t* manager) { OPJ_UINT32 numcomps = p_image->numcomps; @@ -1567,7 +1631,7 @@ opj_pi_iterator_t *opj_pi_initialise_encode(const opj_image_t *p_image, /* encoding prameters to set*/ OPJ_UINT32 l_max_res; OPJ_UINT32 l_max_prec; - OPJ_INT32 l_tx0, l_tx1, l_ty0, l_ty1; + OPJ_UINT32 l_tx0, l_tx1, l_ty0, l_ty1; OPJ_UINT32 l_dx_min, l_dy_min; OPJ_UINT32 l_bound; OPJ_UINT32 
l_step_p, l_step_c, l_step_r, l_step_l ; @@ -1606,7 +1670,7 @@ opj_pi_iterator_t *opj_pi_initialise_encode(const opj_image_t *p_image, } /* memory allocation for pi*/ - l_pi = opj_pi_create(p_image, p_cp, p_tile_no); + l_pi = opj_pi_create(p_image, p_cp, p_tile_no, manager); if (!l_pi) { opj_free(l_tmp_data); opj_free(l_tmp_ptr); @@ -1761,7 +1825,8 @@ void opj_pi_create_encode(opj_pi_iterator_t *pi, pi[pino].poc.prg = tcp->prg; if (!(cp->m_specific_param.m_enc.m_tp_on && ((!OPJ_IS_CINEMA(cp->rsiz) && - (t2_mode == FINAL_PASS)) || OPJ_IS_CINEMA(cp->rsiz)))) { + !OPJ_IS_IMF(cp->rsiz) && + (t2_mode == FINAL_PASS)) || OPJ_IS_CINEMA(cp->rsiz) || OPJ_IS_IMF(cp->rsiz)))) { pi[pino].poc.resno0 = tcp->resS; pi[pino].poc.resno1 = tcp->resE; pi[pino].poc.compno0 = tcp->compS; @@ -1770,10 +1835,10 @@ void opj_pi_create_encode(opj_pi_iterator_t *pi, pi[pino].poc.layno1 = tcp->layE; pi[pino].poc.precno0 = tcp->prcS; pi[pino].poc.precno1 = tcp->prcE; - pi[pino].poc.tx0 = (OPJ_INT32)tcp->txS; - pi[pino].poc.ty0 = (OPJ_INT32)tcp->tyS; - pi[pino].poc.tx1 = (OPJ_INT32)tcp->txE; - pi[pino].poc.ty1 = (OPJ_INT32)tcp->tyE; + pi[pino].poc.tx0 = tcp->txS; + pi[pino].poc.ty0 = tcp->tyS; + pi[pino].poc.tx1 = tcp->txE; + pi[pino].poc.ty1 = tcp->tyE; } else { for (i = tppos + 1; i < 4; i++) { switch (prog[i]) { @@ -1797,10 +1862,10 @@ void opj_pi_create_encode(opj_pi_iterator_t *pi, pi[pino].poc.precno1 = tcp->prcE; break; default: - pi[pino].poc.tx0 = (OPJ_INT32)tcp->txS; - pi[pino].poc.ty0 = (OPJ_INT32)tcp->tyS; - pi[pino].poc.tx1 = (OPJ_INT32)tcp->txE; - pi[pino].poc.ty1 = (OPJ_INT32)tcp->tyE; + pi[pino].poc.tx0 = tcp->txS; + pi[pino].poc.ty0 = tcp->tyS; + pi[pino].poc.tx1 = tcp->txE; + pi[pino].poc.ty1 = tcp->tyE; break; } break; @@ -1840,10 +1905,10 @@ void opj_pi_create_encode(opj_pi_iterator_t *pi, default: tcp->tx0_t = tcp->txS; tcp->ty0_t = tcp->tyS; - pi[pino].poc.tx0 = (OPJ_INT32)tcp->tx0_t; - pi[pino].poc.tx1 = (OPJ_INT32)(tcp->tx0_t + tcp->dx - (tcp->tx0_t % tcp->dx)); - pi[pino].poc.ty0 = (OPJ_INT32)tcp->ty0_t; - pi[pino].poc.ty1 = (OPJ_INT32)(tcp->ty0_t + tcp->dy - (tcp->ty0_t % tcp->dy)); + pi[pino].poc.tx0 = tcp->tx0_t; + pi[pino].poc.tx1 = tcp->tx0_t + tcp->dx - (tcp->tx0_t % tcp->dx); + pi[pino].poc.ty0 = tcp->ty0_t; + pi[pino].poc.ty1 = tcp->ty0_t + tcp->dy - (tcp->ty0_t % tcp->dy); tcp->tx0_t = (OPJ_UINT32)pi[pino].poc.tx1; tcp->ty0_t = (OPJ_UINT32)pi[pino].poc.ty1; break; @@ -1875,10 +1940,10 @@ void opj_pi_create_encode(opj_pi_iterator_t *pi, pi[pino].poc.precno1 = tcp->prc_t; break; default: - pi[pino].poc.tx0 = (OPJ_INT32)(tcp->tx0_t - tcp->dx - (tcp->tx0_t % tcp->dx)); - pi[pino].poc.tx1 = (OPJ_INT32)tcp->tx0_t ; - pi[pino].poc.ty0 = (OPJ_INT32)(tcp->ty0_t - tcp->dy - (tcp->ty0_t % tcp->dy)); - pi[pino].poc.ty1 = (OPJ_INT32)tcp->ty0_t ; + pi[pino].poc.tx0 = tcp->tx0_t - tcp->dx - (tcp->tx0_t % tcp->dx); + pi[pino].poc.tx1 = tcp->tx0_t ; + pi[pino].poc.ty0 = tcp->ty0_t - tcp->dy - (tcp->ty0_t % tcp->dy); + pi[pino].poc.ty1 = tcp->ty0_t ; break; } break; @@ -1965,8 +2030,8 @@ void opj_pi_create_encode(opj_pi_iterator_t *pi, if (tcp->ty0_t >= tcp->tyE) { if (opj_pi_check_next_level(i - 1, cp, tileno, pino, prog)) { tcp->ty0_t = tcp->tyS; - pi[pino].poc.ty0 = (OPJ_INT32)tcp->ty0_t; - pi[pino].poc.ty1 = (OPJ_INT32)(tcp->ty0_t + tcp->dy - (tcp->ty0_t % tcp->dy)); + pi[pino].poc.ty0 = tcp->ty0_t; + pi[pino].poc.ty1 = tcp->ty0_t + tcp->dy - (tcp->ty0_t % tcp->dy); tcp->ty0_t = (OPJ_UINT32)pi[pino].poc.ty1; incr_top = 1; resetX = 1; @@ -1975,21 +2040,21 @@ void opj_pi_create_encode(opj_pi_iterator_t 
*pi, resetX = 0; } } else { - pi[pino].poc.ty0 = (OPJ_INT32)tcp->ty0_t; - pi[pino].poc.ty1 = (OPJ_INT32)(tcp->ty0_t + tcp->dy - (tcp->ty0_t % tcp->dy)); + pi[pino].poc.ty0 = tcp->ty0_t; + pi[pino].poc.ty1 = tcp->ty0_t + tcp->dy - (tcp->ty0_t % tcp->dy); tcp->ty0_t = (OPJ_UINT32)pi[pino].poc.ty1; incr_top = 0; resetX = 1; } if (resetX == 1) { tcp->tx0_t = tcp->txS; - pi[pino].poc.tx0 = (OPJ_INT32)tcp->tx0_t; - pi[pino].poc.tx1 = (OPJ_INT32)(tcp->tx0_t + tcp->dx - (tcp->tx0_t % tcp->dx)); + pi[pino].poc.tx0 = tcp->tx0_t; + pi[pino].poc.tx1 = tcp->tx0_t + tcp->dx - (tcp->tx0_t % tcp->dx); tcp->tx0_t = (OPJ_UINT32)pi[pino].poc.tx1; } } else { - pi[pino].poc.tx0 = (OPJ_INT32)tcp->tx0_t; - pi[pino].poc.tx1 = (OPJ_INT32)(tcp->tx0_t + tcp->dx - (tcp->tx0_t % tcp->dx)); + pi[pino].poc.tx0 = tcp->tx0_t; + pi[pino].poc.tx1 = tcp->tx0_t + tcp->dx - (tcp->tx0_t % tcp->dx); tcp->tx0_t = (OPJ_UINT32)pi[pino].poc.tx1; incr_top = 0; } @@ -2042,7 +2107,7 @@ void opj_pi_update_encoding_parameters(const opj_image_t *p_image, /* encoding parameters to set */ OPJ_UINT32 l_max_res; OPJ_UINT32 l_max_prec; - OPJ_INT32 l_tx0, l_tx1, l_ty0, l_ty1; + OPJ_UINT32 l_tx0, l_tx1, l_ty0, l_ty1; OPJ_UINT32 l_dx_min, l_dy_min; /* pointers */ diff --git a/3rdparty/openjpeg/openjp2/pi.h b/3rdparty/openjpeg/openjp2/pi.h index 8c0dc25c19..0320523b76 100644 --- a/3rdparty/openjpeg/openjp2/pi.h +++ b/3rdparty/openjpeg/openjp2/pi.h @@ -102,11 +102,13 @@ typedef struct opj_pi_iterator { /** Components*/ opj_pi_comp_t *comps; /** FIXME DOC*/ - OPJ_INT32 tx0, ty0, tx1, ty1; + OPJ_UINT32 tx0, ty0, tx1, ty1; /** FIXME DOC*/ - OPJ_INT32 x, y; + OPJ_UINT32 x, y; /** FIXME DOC*/ OPJ_UINT32 dx, dy; + /** event manager */ + opj_event_mgr_t* manager; } opj_pi_iterator_t; /** @name Exported functions */ @@ -119,13 +121,15 @@ typedef struct opj_pi_iterator { * @param cp the coding parameters. * @param tileno index of the tile being encoded. * @param t2_mode the type of pass for generating the packet iterator + * @param manager Event manager * * @return a list of packet iterator that points to the first packet of the tile (not true). */ opj_pi_iterator_t *opj_pi_initialise_encode(const opj_image_t *image, opj_cp_t *cp, OPJ_UINT32 tileno, - J2K_T2_MODE t2_mode); + J2K_T2_MODE t2_mode, + opj_event_mgr_t* manager); /** * Updates the encoding parameters of the codec. @@ -161,12 +165,14 @@ Create a packet iterator for Decoder @param image Raw image for which the packets will be listed @param cp Coding parameters @param tileno Number that identifies the tile for which to list the packets +@param manager Event manager @return Returns a packet iterator that points to the first packet of the tile @see opj_pi_destroy */ opj_pi_iterator_t *opj_pi_create_decode(opj_image_t * image, opj_cp_t * cp, - OPJ_UINT32 tileno); + OPJ_UINT32 tileno, + opj_event_mgr_t* manager); /** * Destroys a packet iterator array. * @@ -182,6 +188,17 @@ Modify the packet iterator to point to the next packet @return Returns false if pi pointed to the last packet or else returns true */ OPJ_BOOL opj_pi_next(opj_pi_iterator_t * pi); + +/** + * Return the number of packets in the tile. + * @param image the image being encoded. + * @param cp Coding parameters + * @param tileno Number that identifies the tile. 
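/* A worked sizing example (an illustration, not part of the patch): the
 * count returned by opj_get_encoding_packet_count() is the product of the
 * per-tile maxima, numlayers * max_prec * numcomps * max_res, so it is an
 * upper bound on the packets actually emitted. For instance 3 layers,
 * 12 precincts max, 3 components and 6 resolutions give
 * 3 * 12 * 3 * 6 = 648 entries; the t2.c hunk further below uses exactly
 * this value to size the p_packet_size array for PLT marker generation. */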
+ */ +OPJ_UINT32 opj_get_encoding_packet_count(const opj_image_t *p_image, + const opj_cp_t *p_cp, + OPJ_UINT32 p_tile_no); + /* ----------------------------------------------------------------------- */ /*@}*/ diff --git a/3rdparty/openjpeg/openjp2/t1.c b/3rdparty/openjpeg/openjp2/t1.c index f6f7671190..1bea54b0d5 100644 --- a/3rdparty/openjpeg/openjp2/t1.c +++ b/3rdparty/openjpeg/openjp2/t1.c @@ -61,6 +61,13 @@ #define opj_t1_setcurctx(curctx, ctxno) curctx = &(mqc)->ctxs[(OPJ_UINT32)(ctxno)] +/* Macros to deal with signed integer with just MSB bit set for + * negative values (smr = signed magnitude representation) */ +#define opj_smr_abs(x) (((OPJ_UINT32)(x)) & 0x7FFFFFFFU) +#define opj_smr_sign(x) (((OPJ_UINT32)(x)) >> 31) +#define opj_to_smr(x) ((x) >= 0 ? (OPJ_UINT32)(x) : ((OPJ_UINT32)(-x) | 0x80000000U)) + + /** @name Local static functions */ /*@{*/ @@ -177,18 +184,18 @@ static OPJ_FLOAT64 opj_t1_getwmsedec( const OPJ_FLOAT64 * mct_norms, OPJ_UINT32 mct_numcomps); -static void opj_t1_encode_cblk(opj_t1_t *t1, - opj_tcd_cblk_enc_t* cblk, - OPJ_UINT32 orient, - OPJ_UINT32 compno, - OPJ_UINT32 level, - OPJ_UINT32 qmfbid, - OPJ_FLOAT64 stepsize, - OPJ_UINT32 cblksty, - OPJ_UINT32 numcomps, - opj_tcd_tile_t * tile, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps); +/** Return "cumwmsedec" that should be used to increase tile->distotile */ +static double opj_t1_encode_cblk(opj_t1_t *t1, + opj_tcd_cblk_enc_t* cblk, + OPJ_UINT32 orient, + OPJ_UINT32 compno, + OPJ_UINT32 level, + OPJ_UINT32 qmfbid, + OPJ_FLOAT64 stepsize, + OPJ_UINT32 cblksty, + OPJ_UINT32 numcomps, + const OPJ_FLOAT64 * mct_norms, + OPJ_UINT32 mct_numcomps); /** Decode 1 code-block @@ -329,61 +336,53 @@ static INLINE void opj_t1_update_flags(opj_flag_t *flagsp, OPJ_UINT32 ci, /** Encode significant pass */ -static INLINE void opj_t1_enc_sigpass_step(opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 bpno, - OPJ_INT32 one, - OPJ_INT32 *nmsedec, - OPJ_BYTE type, - OPJ_UINT32 ci, - OPJ_UINT32 vsc) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - OPJ_UINT32 const flags = *flagsp; - - if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U))) == 0U && - (flags & (T1_SIGMA_NEIGHBOURS << (ci * 3U))) != 0U) { - OPJ_UINT32 ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); - v = (opj_int_abs(*datap) & one) ? 1 : 0; -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " ctxt1=%d\n", ctxt1); -#endif - opj_mqc_setcurctx(mqc, ctxt1); - if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ - opj_mqc_bypass_enc(mqc, v); - } else { - opj_mqc_encode(mqc, v); - } - if (v) { - OPJ_UINT32 lu = opj_t1_getctxtno_sc_or_spb_index( - *flagsp, - flagsp[-1], flagsp[1], - ci); - OPJ_UINT32 ctxt2 = opj_t1_getctxno_sc(lu); - v = *datap < 0 ? 
1U : 0U; - *nmsedec += opj_t1_getnmsedec_sig((OPJ_UINT32)opj_int_abs(*datap), - (OPJ_UINT32)bpno); -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " ctxt2=%d\n", ctxt2); -#endif - opj_mqc_setcurctx(mqc, ctxt2); - if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ - opj_mqc_bypass_enc(mqc, v); - } else { - OPJ_UINT32 spb = opj_t1_getspb(lu); -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " spb=%d\n", spb); -#endif - opj_mqc_encode(mqc, v ^ spb); - } - opj_t1_update_flags(flagsp, ci, v, t1->w + 2, vsc); - } - *flagsp |= T1_PI_THIS << (ci * 3U); - } +#define opj_t1_enc_sigpass_step_macro(mqc, curctx, a, c, ct, flagspIn, datapIn, bpno, one, nmsedec, type, ciIn, vscIn) \ +{ \ + OPJ_UINT32 v; \ + const OPJ_UINT32 ci = (ciIn); \ + const OPJ_UINT32 vsc = (vscIn); \ + const OPJ_INT32* l_datap = (datapIn); \ + opj_flag_t* flagsp = (flagspIn); \ + OPJ_UINT32 const flags = *flagsp; \ + if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U))) == 0U && \ + (flags & (T1_SIGMA_NEIGHBOURS << (ci * 3U))) != 0U) { \ + OPJ_UINT32 ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); \ + v = (opj_smr_abs(*l_datap) & (OPJ_UINT32)one) ? 1 : 0; \ +/* #ifdef DEBUG_ENC_SIG */ \ +/* fprintf(stderr, " ctxt1=%d\n", ctxt1); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt1); \ + if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ \ + opj_mqc_bypass_enc_macro(mqc, c, ct, v); \ + } else { \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v); \ + } \ + if (v) { \ + OPJ_UINT32 lu = opj_t1_getctxtno_sc_or_spb_index( \ + *flagsp, \ + flagsp[-1], flagsp[1], \ + ci); \ + OPJ_UINT32 ctxt2 = opj_t1_getctxno_sc(lu); \ + v = opj_smr_sign(*l_datap); \ + *nmsedec += opj_t1_getnmsedec_sig(opj_smr_abs(*l_datap), \ + (OPJ_UINT32)bpno); \ +/* #ifdef DEBUG_ENC_SIG */ \ +/* fprintf(stderr, " ctxt2=%d\n", ctxt2); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt2); \ + if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ \ + opj_mqc_bypass_enc_macro(mqc, c, ct, v); \ + } else { \ + OPJ_UINT32 spb = opj_t1_getspb(lu); \ +/* #ifdef DEBUG_ENC_SIG */ \ +/* fprintf(stderr, " spb=%d\n", spb); */ \ +/* #endif */ \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v ^ spb); \ + } \ + opj_t1_update_flags(flagsp, ci, v, t1->w + 2, vsc); \ + } \ + *flagsp |= T1_PI_THIS << (ci * 3U); \ + } \ } static INLINE void opj_t1_dec_sigpass_step_raw( @@ -464,63 +463,64 @@ static void opj_t1_enc_sigpass(opj_t1_t *t1, OPJ_INT32 const one = 1 << (bpno + T1_NMSEDEC_FRACBITS); opj_flag_t* f = &T1_FLAGS(0, 0); OPJ_UINT32 const extra = 2; + opj_mqc_t* mqc = &(t1->mqc); + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); + const OPJ_INT32* datap = t1->data; *nmsedec = 0; #ifdef DEBUG_ENC_SIG fprintf(stderr, "enc_sigpass: bpno=%d\n", bpno); #endif - for (k = 0; k < (t1->h & ~3U); k += 4) { + for (k = 0; k < (t1->h & ~3U); k += 4, f += extra) { + const OPJ_UINT32 w = t1->w; #ifdef DEBUG_ENC_SIG fprintf(stderr, " k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < w; ++i, ++f, datap += 4) { #ifdef DEBUG_ENC_SIG fprintf(stderr, " i=%d\n", i); #endif if (*f == 0U) { /* Nothing to do for any of the 4 data points */ - f++; continue; } - opj_t1_enc_sigpass_step( - t1, + opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[((k + 0) * t1->data_stride) + i], + &datap[0], bpno, one, nmsedec, type, 0, cblksty & J2K_CCP_CBLKSTY_VSC); - opj_t1_enc_sigpass_step( - t1, + opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[((k + 1) * t1->data_stride) + i], + &datap[1], bpno, one, nmsedec, type, 1, 0); - opj_t1_enc_sigpass_step( - t1, + 
opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[((k + 2) * t1->data_stride) + i], + &datap[2], bpno, one, nmsedec, type, 2, 0); - opj_t1_enc_sigpass_step( - t1, + opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[((k + 3) * t1->data_stride) + i], + &datap[3], bpno, one, nmsedec, type, 3, 0); - ++f; } - f += extra; } if (k < t1->h) { @@ -528,20 +528,20 @@ static void opj_t1_enc_sigpass(opj_t1_t *t1, #ifdef DEBUG_ENC_SIG fprintf(stderr, " k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, ++f) { #ifdef DEBUG_ENC_SIG fprintf(stderr, " i=%d\n", i); #endif if (*f == 0U) { /* Nothing to do for any of the 4 data points */ - f++; + datap += (t1->h - k); continue; } - for (j = k; j < t1->h; ++j) { - opj_t1_enc_sigpass_step( - t1, + for (j = k; j < t1->h; ++j, ++datap) { + opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[(j * t1->data_stride) + i], + &datap[0], bpno, one, nmsedec, @@ -549,9 +549,10 @@ static void opj_t1_enc_sigpass(opj_t1_t *t1, j - k, (j == k && (cblksty & J2K_CCP_CBLKSTY_VSC) != 0)); } - ++f; } } + + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); } static void opj_t1_dec_sigpass_raw( @@ -626,7 +627,7 @@ static void opj_t1_dec_sigpass_raw( register opj_flag_t *flagsp = &t1->flags[(flags_stride) + 1]; \ const OPJ_UINT32 l_w = w; \ opj_mqc_t* mqc = &(t1->mqc); \ - DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ register OPJ_UINT32 v; \ one = 1 << bpno; \ half = one >> 1; \ @@ -651,7 +652,7 @@ static void opj_t1_dec_sigpass_raw( } \ } \ } \ - UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ if( k < h ) { \ for (i = 0; i < l_w; ++i, ++data, ++flagsp) { \ for (j = 0; j < h - k; ++j) { \ @@ -715,38 +716,27 @@ static void opj_t1_dec_sigpass_mqc( /** Encode refinement pass step */ -static INLINE void opj_t1_enc_refpass_step(opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 bpno, - OPJ_INT32 one, - OPJ_INT32 *nmsedec, - OPJ_BYTE type, - OPJ_UINT32 ci) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - OPJ_UINT32 const shift_flags = - (*flagsp >> (ci * 3U)); - - if ((shift_flags & (T1_SIGMA_THIS | T1_PI_THIS)) == T1_SIGMA_THIS) { - OPJ_UINT32 ctxt = opj_t1_getctxno_mag(shift_flags); - *nmsedec += opj_t1_getnmsedec_ref((OPJ_UINT32)opj_int_abs(*datap), - (OPJ_UINT32)bpno); - v = (opj_int_abs(*datap) & one) ? 1 : 0; -#ifdef DEBUG_ENC_REF - fprintf(stderr, " ctxt=%d\n", ctxt); -#endif - opj_mqc_setcurctx(mqc, ctxt); - if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ - opj_mqc_bypass_enc(mqc, v); - } else { - opj_mqc_encode(mqc, v); - } - *flagsp |= T1_MU_THIS << (ci * 3U); - } +#define opj_t1_enc_refpass_step_macro(mqc, curctx, a, c, ct, flags, flagsUpdated, datap, bpno, one, nmsedec, type, ci) \ +{\ + OPJ_UINT32 v; \ + if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << ((ci) * 3U))) == (T1_SIGMA_THIS << ((ci) * 3U))) { \ + const OPJ_UINT32 shift_flags = (flags >> ((ci) * 3U)); \ + OPJ_UINT32 ctxt = opj_t1_getctxno_mag(shift_flags); \ + OPJ_UINT32 abs_data = opj_smr_abs(*datap); \ + *nmsedec += opj_t1_getnmsedec_ref(abs_data, \ + (OPJ_UINT32)bpno); \ + v = ((OPJ_INT32)abs_data & one) ? 
1 : 0; \ +/* #ifdef DEBUG_ENC_REF */ \ +/* fprintf(stderr, " ctxt=%d\n", ctxt); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt); \ + if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ \ + opj_mqc_bypass_enc_macro(mqc, c, ct, v); \ + } else { \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v); \ + } \ + flagsUpdated |= T1_MU_THIS << ((ci) * 3U); \ + } \ } @@ -807,100 +797,104 @@ static void opj_t1_enc_refpass( const OPJ_INT32 one = 1 << (bpno + T1_NMSEDEC_FRACBITS); opj_flag_t* f = &T1_FLAGS(0, 0); const OPJ_UINT32 extra = 2U; + opj_mqc_t* mqc = &(t1->mqc); + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); + const OPJ_INT32* datap = t1->data; *nmsedec = 0; #ifdef DEBUG_ENC_REF fprintf(stderr, "enc_refpass: bpno=%d\n", bpno); #endif - for (k = 0; k < (t1->h & ~3U); k += 4) { + for (k = 0; k < (t1->h & ~3U); k += 4, f += extra) { #ifdef DEBUG_ENC_REF fprintf(stderr, " k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, f++, datap += 4) { + const OPJ_UINT32 flags = *f; + OPJ_UINT32 flagsUpdated = flags; #ifdef DEBUG_ENC_REF fprintf(stderr, " i=%d\n", i); #endif - if ((*f & (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13)) == 0) { + if ((flags & (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13)) == 0) { /* none significant */ - f++; continue; } - if ((*f & (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3)) == + if ((flags & (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3)) == (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3)) { /* all processed by sigpass */ - f++; continue; } - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 0) * t1->data_stride) + i], + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + flags, flagsUpdated, + &datap[0], bpno, one, nmsedec, type, 0); - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 1) * t1->data_stride) + i], + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + flags, flagsUpdated, + &datap[1], bpno, one, nmsedec, type, 1); - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 2) * t1->data_stride) + i], + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + flags, flagsUpdated, + &datap[2], bpno, one, nmsedec, type, 2); - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 3) * t1->data_stride) + i], + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + flags, flagsUpdated, + &datap[3], bpno, one, nmsedec, type, 3); - ++f; + *f = flagsUpdated; } - f += extra; } if (k < t1->h) { OPJ_UINT32 j; + const OPJ_UINT32 remaining_lines = t1->h - k; #ifdef DEBUG_ENC_REF fprintf(stderr, " k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, ++f) { #ifdef DEBUG_ENC_REF fprintf(stderr, " i=%d\n", i); #endif if ((*f & (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13)) == 0) { /* none significant */ - f++; + datap += remaining_lines; continue; } - for (j = k; j < t1->h; ++j) { - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[(j * t1->data_stride) + i], + for (j = 0; j < remaining_lines; ++j, datap ++) { + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + *f, *f, + &datap[0], bpno, one, nmsedec, type, - j - k); + j); } - ++f; } } + + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); } @@ -968,7 +962,7 @@ static void opj_t1_dec_refpass_raw( register opj_flag_t *flagsp = &t1->flags[flags_stride + 1]; \ const OPJ_UINT32 l_w = w; \ opj_mqc_t* mqc = &(t1->mqc); \ - DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ register OPJ_UINT32 v; \ one = 1 << bpno; \ poshalf = one >> 1; \ @@ -992,7 +986,7 @@ static void opj_t1_dec_refpass_raw( } \ } 
\ } \ - UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ if( k < h ) { \ for (i = 0; i < l_w; ++i, ++data, ++flagsp) { \ for (j = 0; j < h - k; ++j) { \ @@ -1030,86 +1024,71 @@ static void opj_t1_dec_refpass_mqc( /** Encode clean-up pass step */ -static void opj_t1_enc_clnpass_step( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 bpno, - OPJ_INT32 one, - OPJ_INT32 *nmsedec, - OPJ_UINT32 agg, - OPJ_UINT32 runlen, - OPJ_UINT32 lim, - OPJ_UINT32 cblksty) -{ - OPJ_UINT32 v; - OPJ_UINT32 ci; - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - const OPJ_UINT32 check = (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13 | - T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); - - if ((*flagsp & check) == check) { - if (runlen == 0) { - *flagsp &= ~(T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); - } else if (runlen == 1) { - *flagsp &= ~(T1_PI_1 | T1_PI_2 | T1_PI_3); - } else if (runlen == 2) { - *flagsp &= ~(T1_PI_2 | T1_PI_3); - } else if (runlen == 3) { - *flagsp &= ~(T1_PI_3); - } - return; - } - - for (ci = runlen; ci < lim; ++ci) { - OPJ_UINT32 vsc; - opj_flag_t flags; - OPJ_UINT32 ctxt1; - - flags = *flagsp; - - if ((agg != 0) && (ci == runlen)) { - goto LABEL_PARTIAL; - } - - if (!(flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U)))) { - ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); -#ifdef DEBUG_ENC_CLN - printf(" ctxt1=%d\n", ctxt1); -#endif - opj_mqc_setcurctx(mqc, ctxt1); - v = (opj_int_abs(*datap) & one) ? 1 : 0; - opj_mqc_encode(mqc, v); - if (v) { - OPJ_UINT32 ctxt2, spb; - OPJ_UINT32 lu; -LABEL_PARTIAL: - lu = opj_t1_getctxtno_sc_or_spb_index( - *flagsp, - flagsp[-1], flagsp[1], - ci); - *nmsedec += opj_t1_getnmsedec_sig((OPJ_UINT32)opj_int_abs(*datap), - (OPJ_UINT32)bpno); - ctxt2 = opj_t1_getctxno_sc(lu); -#ifdef DEBUG_ENC_CLN - printf(" ctxt2=%d\n", ctxt2); -#endif - opj_mqc_setcurctx(mqc, ctxt2); - - v = *datap < 0 ? 1U : 0U; - spb = opj_t1_getspb(lu); -#ifdef DEBUG_ENC_CLN - printf(" spb=%d\n", spb); -#endif - opj_mqc_encode(mqc, v ^ spb); - vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (ci == 0)) ? 1 : 0; - opj_t1_update_flags(flagsp, ci, v, t1->w + 2U, vsc); - } - } - *flagsp &= ~(T1_PI_THIS << (3U * ci)); - datap += t1->data_stride; - } +#define opj_t1_enc_clnpass_step_macro(mqc, curctx, a, c, ct, flagspIn, datapIn, bpno, one, nmsedec, agg, runlen, lim, cblksty) \ +{ \ + OPJ_UINT32 v; \ + OPJ_UINT32 ci; \ + opj_flag_t* const flagsp = (flagspIn); \ + const OPJ_INT32* l_datap = (datapIn); \ + const OPJ_UINT32 check = (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13 | \ + T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); \ + \ + if ((*flagsp & check) == check) { \ + if (runlen == 0) { \ + *flagsp &= ~(T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); \ + } else if (runlen == 1) { \ + *flagsp &= ~(T1_PI_1 | T1_PI_2 | T1_PI_3); \ + } else if (runlen == 2) { \ + *flagsp &= ~(T1_PI_2 | T1_PI_3); \ + } else if (runlen == 3) { \ + *flagsp &= ~(T1_PI_3); \ + } \ + } \ + else \ + for (ci = runlen; ci < lim; ++ci) { \ + OPJ_BOOL goto_PARTIAL = OPJ_FALSE; \ + if ((agg != 0) && (ci == runlen)) { \ + goto_PARTIAL = OPJ_TRUE; \ + } \ + else if (!(*flagsp & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U)))) { \ + OPJ_UINT32 ctxt1 = opj_t1_getctxno_zc(mqc, *flagsp >> (ci * 3U)); \ +/* #ifdef DEBUG_ENC_CLN */ \ +/* printf(" ctxt1=%d\n", ctxt1); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt1); \ + v = (opj_smr_abs(*l_datap) & (OPJ_UINT32)one) ? 
1 : 0; \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v); \ + if (v) { \ + goto_PARTIAL = OPJ_TRUE; \ + } \ + } \ + if( goto_PARTIAL ) { \ + OPJ_UINT32 vsc; \ + OPJ_UINT32 ctxt2, spb; \ + OPJ_UINT32 lu = opj_t1_getctxtno_sc_or_spb_index( \ + *flagsp, \ + flagsp[-1], flagsp[1], \ + ci); \ + *nmsedec += opj_t1_getnmsedec_sig(opj_smr_abs(*l_datap), \ + (OPJ_UINT32)bpno); \ + ctxt2 = opj_t1_getctxno_sc(lu); \ +/* #ifdef DEBUG_ENC_CLN */ \ +/* printf(" ctxt2=%d\n", ctxt2); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt2); \ + \ + v = opj_smr_sign(*l_datap); \ + spb = opj_t1_getspb(lu); \ +/* #ifdef DEBUG_ENC_CLN */ \ +/* printf(" spb=%d\n", spb); */\ +/* #endif */ \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v ^ spb); \ + vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (ci == 0)) ? 1 : 0; \ + opj_t1_update_flags(flagsp, ci, v, t1->w + 2U, vsc); \ + } \ + *flagsp &= ~(T1_PI_THIS << (3U * ci)); \ + l_datap ++; \ + } \ } #define opj_t1_dec_clnpass_step_macro(check_flags, partial, \ @@ -1165,47 +1144,50 @@ static void opj_t1_enc_clnpass( { OPJ_UINT32 i, k; const OPJ_INT32 one = 1 << (bpno + T1_NMSEDEC_FRACBITS); - OPJ_UINT32 agg, runlen; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ + opj_mqc_t* mqc = &(t1->mqc); + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); + const OPJ_INT32* datap = t1->data; + opj_flag_t *f = &T1_FLAGS(0, 0); + const OPJ_UINT32 extra = 2U; *nmsedec = 0; #ifdef DEBUG_ENC_CLN printf("enc_clnpass: bpno=%d\n", bpno); #endif - for (k = 0; k < (t1->h & ~3U); k += 4) { + for (k = 0; k < (t1->h & ~3U); k += 4, f += extra) { #ifdef DEBUG_ENC_CLN printf(" k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, f++) { + OPJ_UINT32 agg, runlen; #ifdef DEBUG_ENC_CLN printf(" i=%d\n", i); #endif - agg = !(T1_FLAGS(i, k)); + agg = !*f; #ifdef DEBUG_ENC_CLN printf(" agg=%d\n", agg); #endif if (agg) { - for (runlen = 0; runlen < 4; ++runlen) { - if (opj_int_abs(t1->data[((k + runlen)*t1->data_stride) + i]) & one) { + for (runlen = 0; runlen < 4; ++runlen, ++datap) { + if (opj_smr_abs(*datap) & (OPJ_UINT32)one) { break; } } - opj_mqc_setcurctx(mqc, T1_CTXNO_AGG); - opj_mqc_encode(mqc, runlen != 4); + opj_t1_setcurctx(curctx, T1_CTXNO_AGG); + opj_mqc_encode_macro(mqc, curctx, a, c, ct, runlen != 4); if (runlen == 4) { continue; } - opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); - opj_mqc_encode(mqc, runlen >> 1); - opj_mqc_encode(mqc, runlen & 1); + opj_t1_setcurctx(curctx, T1_CTXNO_UNI); + opj_mqc_encode_macro(mqc, curctx, a, c, ct, runlen >> 1); + opj_mqc_encode_macro(mqc, curctx, a, c, ct, runlen & 1); } else { runlen = 0; } - opj_t1_enc_clnpass_step( - t1, - &T1_FLAGS(i, k), - &t1->data[((k + runlen) * t1->data_stride) + i], + opj_t1_enc_clnpass_step_macro( + mqc, curctx, a, c, ct, + f, + datap, bpno, one, nmsedec, @@ -1213,23 +1195,24 @@ static void opj_t1_enc_clnpass( runlen, 4U, cblksty); + datap += 4 - runlen; } } if (k < t1->h) { - agg = 0; - runlen = 0; + const OPJ_UINT32 agg = 0; + const OPJ_UINT32 runlen = 0; #ifdef DEBUG_ENC_CLN printf(" k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, f++) { #ifdef DEBUG_ENC_CLN printf(" i=%d\n", i); printf(" agg=%d\n", agg); #endif - opj_t1_enc_clnpass_step( - t1, - &T1_FLAGS(i, k), - &t1->data[((k + runlen) * t1->data_stride) + i], + opj_t1_enc_clnpass_step_macro( + mqc, curctx, a, c, ct, + f, + datap, bpno, one, nmsedec, @@ -1237,8 +1220,11 @@ static void opj_t1_enc_clnpass( runlen, t1->h - k, cblksty); + datap += t1->h - k; } } + + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); } 
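/* The three encoder passes above (sigpass, refpass, clnpass) now share one
 * shape: the MQ-coder state is pulled into locals once per pass and written
 * back once at the end, and coefficients are read in the signed-magnitude
 * form produced by opj_to_smr(). A deliberately simplified sketch of that
 * shape, assuming DOWNLOAD_MQC_VARIABLES/UPLOAD_MQC_VARIABLES copy the coder
 * state (curctx, a, c, ct) to and from locals as their names suggest; the
 * real passes iterate stripe-by-stripe over the flags array rather than
 * linearly as here. */
static void sketch_encode_pass(opj_t1_t *t1, OPJ_INT32 one)
{
    opj_mqc_t* mqc = &(t1->mqc);
    const OPJ_INT32* datap = t1->data;
    const OPJ_UINT32 n = t1->w * t1->h;
    OPJ_UINT32 i;

    DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); /* state -> locals */
    opj_t1_setcurctx(curctx, T1_CTXNO_UNI);        /* pick some context */
    for (i = 0; i < n; ++i, ++datap) {
        /* signed-magnitude representation: magnitude is the low 31 bits,
         * sign is the MSB, e.g. opj_to_smr(-5) == 0x80000005U,
         * opj_smr_abs(0x80000005U) == 5, opj_smr_sign(0x80000005U) == 1 */
        const OPJ_UINT32 mag  = opj_smr_abs(*datap);
        const OPJ_UINT32 sign = opj_smr_sign(*datap);
        opj_mqc_encode_macro(mqc, curctx, a, c, ct,
                             (mag & (OPJ_UINT32)one) ? 1 : 0);
        opj_mqc_encode_macro(mqc, curctx, a, c, ct, sign);
    }
    UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct);   /* locals -> state */
}
/* Keeping curctx/a/c/ct in locals across the whole pass is what the
 * per-sample helper functions were turned into macros for: the coder state
 * no longer bounces through memory on every coefficient. */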
#define opj_t1_dec_clnpass_internal(t1, bpno, vsc, w, h, flags_stride) \ @@ -1250,7 +1236,7 @@ static void opj_t1_enc_clnpass( opj_mqc_t* mqc = &(t1->mqc); \ register OPJ_INT32 *data = t1->data; \ register opj_flag_t *flagsp = &t1->flags[flags_stride + 1]; \ - DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ register OPJ_UINT32 v; \ one = 1 << bpno; \ half = one >> 1; \ @@ -1319,7 +1305,7 @@ static void opj_t1_enc_clnpass( *flagsp = flags & ~(T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); \ } \ } \ - UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ if( k < h ) { \ for (i = 0; i < l_w; ++i, ++flagsp, ++data) { \ for (j = 0; j < h - k; ++j) { \ @@ -1426,7 +1412,11 @@ static OPJ_FLOAT64 opj_t1_getwmsedec( if (qmfbid == 1) { w2 = opj_dwt_getnorm(level, orient); } else { /* if (qmfbid == 0) */ + const OPJ_INT32 log2_gain = (orient == 0) ? 0 : + (orient == 3) ? 2 : 1; w2 = opj_dwt_getnorm_real(level, orient); + /* Not sure this is right. But preserves past behaviour */ + stepsize /= (1 << log2_gain); } wmsedec = w1 * w2 * stepsize * (1 << bpno); @@ -1450,7 +1440,7 @@ static OPJ_BOOL opj_t1_allocate_buffers( assert(w * h <= 4096); /* encoder uses tile buffer, so no need to allocate */ - if (!t1->encoder) { + { OPJ_UINT32 datasize = w * h; if (datasize > t1->datasize) { @@ -1560,8 +1550,7 @@ void opj_t1_destroy(opj_t1_t *p_t1) return; } - /* encoder uses tile buffer, so no need to free */ - if (!p_t1->encoder && p_t1->data) { + if (p_t1->data) { opj_aligned_free(p_t1->data); p_t1->data = 00; } @@ -1658,7 +1647,21 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) t1 = (opj_t1_t*) opj_tls_get(tls, OPJ_TLS_KEY_T1); if (t1 == NULL) { t1 = opj_t1_create(OPJ_FALSE); - opj_tls_set(tls, OPJ_TLS_KEY_T1, t1, opj_t1_destroy_wrapper); + if (t1 == NULL) { + opj_event_msg(job->p_manager, EVT_ERROR, + "Cannot allocate Tier 1 handle\n"); + *(job->pret) = OPJ_FALSE; + opj_free(job); + return; + } + if (!opj_tls_set(tls, OPJ_TLS_KEY_T1, t1, opj_t1_destroy_wrapper)) { + opj_event_msg(job->p_manager, EVT_ERROR, + "Unable to set t1 handle as TLS\n"); + opj_t1_destroy(t1); + *(job->pret) = OPJ_FALSE; + opj_free(job); + return; + } } t1->mustuse_cblkdatabuffer = job->mustuse_cblkdatabuffer; @@ -1725,10 +1728,11 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) datap[i] /= 2; } } else { /* if (tccp->qmfbid == 0) */ + const float stepsize = 0.5f * band->stepsize; i = 0; #ifdef __SSE2__ { - const __m128 xmm_stepsize = _mm_set1_ps(band->stepsize); + const __m128 xmm_stepsize = _mm_set1_ps(stepsize); for (; i < (cblk_size & ~15U); i += 16) { __m128 xmm0_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)( datap + 0))); @@ -1747,7 +1751,7 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) } #endif for (; i < cblk_size; ++i) { - OPJ_FLOAT32 tmp = ((OPJ_FLOAT32)(*datap)) * band->stepsize; + OPJ_FLOAT32 tmp = ((OPJ_FLOAT32)(*datap)) * stepsize; memcpy(datap, &tmp, sizeof(tmp)); datap++; } @@ -1773,12 +1777,13 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) } } } else { /* if (tccp->qmfbid == 0) */ + const float stepsize = 0.5f * band->stepsize; OPJ_FLOAT32* OPJ_RESTRICT tiledp = (OPJ_FLOAT32*) &tilec->data[(OPJ_SIZE_T)y * tile_w + (OPJ_SIZE_T)x]; for (j = 0; j < cblk_h; ++j) { OPJ_FLOAT32* OPJ_RESTRICT tiledp2 = tiledp; for (i = 0; i < cblk_w; ++i) { - OPJ_FLOAT32 tmp = (OPJ_FLOAT32) * datap * band->stepsize; + OPJ_FLOAT32 tmp = 
(OPJ_FLOAT32) * datap * stepsize; *tiledp2 = tmp; datap++; tiledp2++; @@ -2100,124 +2105,232 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, } +typedef struct { + OPJ_UINT32 compno; + OPJ_UINT32 resno; + opj_tcd_cblk_enc_t* cblk; + opj_tcd_tile_t *tile; + opj_tcd_band_t* band; + opj_tcd_tilecomp_t* tilec; + opj_tccp_t* tccp; + const OPJ_FLOAT64 * mct_norms; + OPJ_UINT32 mct_numcomps; + volatile OPJ_BOOL* pret; + opj_mutex_t* mutex; +} opj_t1_cblk_encode_processing_job_t; + +/** Procedure to deal with a asynchronous code-block encoding job. + * + * @param user_data Pointer to a opj_t1_cblk_encode_processing_job_t* structure + * @param tls TLS handle. + */ +static void opj_t1_cblk_encode_processor(void* user_data, opj_tls_t* tls) +{ + opj_t1_cblk_encode_processing_job_t* job = + (opj_t1_cblk_encode_processing_job_t*)user_data; + opj_tcd_cblk_enc_t* cblk = job->cblk; + const opj_tcd_band_t* band = job->band; + const opj_tcd_tilecomp_t* tilec = job->tilec; + const opj_tccp_t* tccp = job->tccp; + const OPJ_UINT32 resno = job->resno; + opj_t1_t* t1; + const OPJ_UINT32 tile_w = (OPJ_UINT32)(tilec->x1 - tilec->x0); + + OPJ_INT32* OPJ_RESTRICT tiledp; + OPJ_UINT32 cblk_w; + OPJ_UINT32 cblk_h; + OPJ_UINT32 i, j; + + OPJ_INT32 x = cblk->x0 - band->x0; + OPJ_INT32 y = cblk->y0 - band->y0; + + if (!*(job->pret)) { + opj_free(job); + return; + } + + t1 = (opj_t1_t*) opj_tls_get(tls, OPJ_TLS_KEY_T1); + if (t1 == NULL) { + t1 = opj_t1_create(OPJ_TRUE); /* OPJ_TRUE == T1 for encoding */ + opj_tls_set(tls, OPJ_TLS_KEY_T1, t1, opj_t1_destroy_wrapper); + } + + if (band->bandno & 1) { + opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 1]; + x += pres->x1 - pres->x0; + } + if (band->bandno & 2) { + opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 1]; + y += pres->y1 - pres->y0; + } + + if (!opj_t1_allocate_buffers( + t1, + (OPJ_UINT32)(cblk->x1 - cblk->x0), + (OPJ_UINT32)(cblk->y1 - cblk->y0))) { + *(job->pret) = OPJ_FALSE; + opj_free(job); + return; + } + + cblk_w = t1->w; + cblk_h = t1->h; + + tiledp = &tilec->data[(OPJ_SIZE_T)y * tile_w + (OPJ_SIZE_T)x]; + + if (tccp->qmfbid == 1) { + /* Do multiplication on unsigned type, even if the + * underlying type is signed, to avoid potential + * int overflow on large value (the output will be + * incorrect in such situation, but whatever...) 
+ * This assumes complement-to-2 signed integer + * representation + * Fixes https://github.com/uclouvain/openjpeg/issues/1053 + */ + OPJ_UINT32* OPJ_RESTRICT tiledp_u = (OPJ_UINT32*) tiledp; + OPJ_UINT32* OPJ_RESTRICT t1data = (OPJ_UINT32*) t1->data; + /* Change from "natural" order to "zigzag" order of T1 passes */ + for (j = 0; j < (cblk_h & ~3U); j += 4) { + for (i = 0; i < cblk_w; ++i) { + t1data[0] = tiledp_u[(j + 0) * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data[1] = tiledp_u[(j + 1) * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data[2] = tiledp_u[(j + 2) * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data[3] = tiledp_u[(j + 3) * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data += 4; + } + } + if (j < cblk_h) { + for (i = 0; i < cblk_w; ++i) { + OPJ_UINT32 k; + for (k = j; k < cblk_h; k++) { + t1data[0] = tiledp_u[k * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data ++; + } + } + } + } else { /* if (tccp->qmfbid == 0) */ + OPJ_FLOAT32* OPJ_RESTRICT tiledp_f = (OPJ_FLOAT32*) tiledp; + OPJ_INT32* OPJ_RESTRICT t1data = t1->data; + /* Change from "natural" order to "zigzag" order of T1 passes */ + for (j = 0; j < (cblk_h & ~3U); j += 4) { + for (i = 0; i < cblk_w; ++i) { + t1data[0] = (OPJ_INT32)opj_lrintf((tiledp_f[(j + 0) * tile_w + i] / + band->stepsize) * (1 << T1_NMSEDEC_FRACBITS)); + t1data[1] = (OPJ_INT32)opj_lrintf((tiledp_f[(j + 1) * tile_w + i] / + band->stepsize) * (1 << T1_NMSEDEC_FRACBITS)); + t1data[2] = (OPJ_INT32)opj_lrintf((tiledp_f[(j + 2) * tile_w + i] / + band->stepsize) * (1 << T1_NMSEDEC_FRACBITS)); + t1data[3] = (OPJ_INT32)opj_lrintf((tiledp_f[(j + 3) * tile_w + i] / + band->stepsize) * (1 << T1_NMSEDEC_FRACBITS)); + t1data += 4; + } + } + if (j < cblk_h) { + for (i = 0; i < cblk_w; ++i) { + OPJ_UINT32 k; + for (k = j; k < cblk_h; k++) { + t1data[0] = (OPJ_INT32)opj_lrintf((tiledp_f[k * tile_w + i] / band->stepsize) + * (1 << T1_NMSEDEC_FRACBITS)); + t1data ++; + } + } + } + } + + { + OPJ_FLOAT64 cumwmsedec = + opj_t1_encode_cblk( + t1, + cblk, + band->bandno, + job->compno, + tilec->numresolutions - 1 - resno, + tccp->qmfbid, + band->stepsize, + tccp->cblksty, + job->tile->numcomps, + job->mct_norms, + job->mct_numcomps); + if (job->mutex) { + opj_mutex_lock(job->mutex); + } + job->tile->distotile += cumwmsedec; + if (job->mutex) { + opj_mutex_unlock(job->mutex); + } + } + + opj_free(job); +} -OPJ_BOOL opj_t1_encode_cblks(opj_t1_t *t1, +OPJ_BOOL opj_t1_encode_cblks(opj_tcd_t* tcd, opj_tcd_tile_t *tile, opj_tcp_t *tcp, const OPJ_FLOAT64 * mct_norms, OPJ_UINT32 mct_numcomps ) { + volatile OPJ_BOOL ret = OPJ_TRUE; + opj_thread_pool_t* tp = tcd->thread_pool; OPJ_UINT32 compno, resno, bandno, precno, cblkno; + opj_mutex_t* mutex = opj_mutex_create(); tile->distotile = 0; /* fixed_quality */ for (compno = 0; compno < tile->numcomps; ++compno) { opj_tcd_tilecomp_t* tilec = &tile->comps[compno]; opj_tccp_t* tccp = &tcp->tccps[compno]; - OPJ_UINT32 tile_w = (OPJ_UINT32)(tilec->x1 - tilec->x0); for (resno = 0; resno < tilec->numresolutions; ++resno) { opj_tcd_resolution_t *res = &tilec->resolutions[resno]; for (bandno = 0; bandno < res->numbands; ++bandno) { opj_tcd_band_t* OPJ_RESTRICT band = &res->bands[bandno]; - OPJ_INT32 bandconst; /* Skip empty bands */ if (opj_tcd_is_band_empty(band)) { continue; } - - bandconst = 8192 * 8192 / ((OPJ_INT32) floor(band->stepsize * 8192)); for (precno = 0; precno < res->pw * res->ph; ++precno) { opj_tcd_precinct_t *prc = &band->precincts[precno]; for (cblkno = 0; cblkno < prc->cw * prc->ch; ++cblkno) { opj_tcd_cblk_enc_t* cblk = 
&prc->cblks.enc[cblkno]; - OPJ_INT32* OPJ_RESTRICT tiledp; - OPJ_UINT32 cblk_w; - OPJ_UINT32 cblk_h; - OPJ_UINT32 i, j, tileLineAdvance; - OPJ_SIZE_T tileIndex = 0; - OPJ_INT32 x = cblk->x0 - band->x0; - OPJ_INT32 y = cblk->y0 - band->y0; - if (band->bandno & 1) { - opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 1]; - x += pres->x1 - pres->x0; + opj_t1_cblk_encode_processing_job_t* job = + (opj_t1_cblk_encode_processing_job_t*) opj_calloc(1, + sizeof(opj_t1_cblk_encode_processing_job_t)); + if (!job) { + ret = OPJ_FALSE; + goto end; } - if (band->bandno & 2) { - opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 1]; - y += pres->y1 - pres->y0; - } - - if (!opj_t1_allocate_buffers( - t1, - (OPJ_UINT32)(cblk->x1 - cblk->x0), - (OPJ_UINT32)(cblk->y1 - cblk->y0))) { - return OPJ_FALSE; - } - - cblk_w = t1->w; - cblk_h = t1->h; - tileLineAdvance = tile_w - cblk_w; - - tiledp = &tilec->data[(OPJ_SIZE_T)y * tile_w + (OPJ_SIZE_T)x]; - t1->data = tiledp; - t1->data_stride = tile_w; - if (tccp->qmfbid == 1) { - /* Do multiplication on unsigned type, even if the - * underlying type is signed, to avoid potential - * int overflow on large value (the output will be - * incorrect in such situation, but whatever...) - * This assumes complement-to-2 signed integer - * representation - * Fixes https://github.com/uclouvain/openjpeg/issues/1053 - */ - OPJ_UINT32* OPJ_RESTRICT tiledp_u = (OPJ_UINT32*) tiledp; - for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { - tiledp_u[tileIndex] <<= T1_NMSEDEC_FRACBITS; - tileIndex++; - } - tileIndex += tileLineAdvance; - } - } else { /* if (tccp->qmfbid == 0) */ - for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { - OPJ_INT32 tmp = tiledp[tileIndex]; - tiledp[tileIndex] = - opj_int_fix_mul_t1( - tmp, - bandconst); - tileIndex++; - } - tileIndex += tileLineAdvance; - } - } - - opj_t1_encode_cblk( - t1, - cblk, - band->bandno, - compno, - tilec->numresolutions - 1 - resno, - tccp->qmfbid, - band->stepsize, - tccp->cblksty, - tile->numcomps, - tile, - mct_norms, - mct_numcomps); + job->compno = compno; + job->tile = tile; + job->resno = resno; + job->cblk = cblk; + job->band = band; + job->tilec = tilec; + job->tccp = tccp; + job->mct_norms = mct_norms; + job->mct_numcomps = mct_numcomps; + job->pret = &ret; + job->mutex = mutex; + opj_thread_pool_submit_job(tp, opj_t1_cblk_encode_processor, job); } /* cblkno */ } /* precno */ } /* bandno */ } /* resno */ } /* compno */ - return OPJ_TRUE; + +end: + opj_thread_pool_wait_completion(tcd->thread_pool, 0); + if (mutex) { + opj_mutex_destroy(mutex); + } + + return ret; } /* Returns whether the pass (bpno, passtype) is terminated */ @@ -2252,18 +2365,17 @@ static int opj_t1_enc_is_term_pass(opj_tcd_cblk_enc_t* cblk, /** mod fixed_quality */ -static void opj_t1_encode_cblk(opj_t1_t *t1, - opj_tcd_cblk_enc_t* cblk, - OPJ_UINT32 orient, - OPJ_UINT32 compno, - OPJ_UINT32 level, - OPJ_UINT32 qmfbid, - OPJ_FLOAT64 stepsize, - OPJ_UINT32 cblksty, - OPJ_UINT32 numcomps, - opj_tcd_tile_t * tile, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps) +static OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1, + opj_tcd_cblk_enc_t* cblk, + OPJ_UINT32 orient, + OPJ_UINT32 compno, + OPJ_UINT32 level, + OPJ_UINT32 qmfbid, + OPJ_FLOAT64 stepsize, + OPJ_UINT32 cblksty, + OPJ_UINT32 numcomps, + const OPJ_FLOAT64 * mct_norms, + OPJ_UINT32 mct_numcomps) { OPJ_FLOAT64 cumwmsedec = 0.0; @@ -2277,6 +2389,7 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, OPJ_UINT32 i, j; OPJ_BYTE type = T1_TYPE_MQ; OPJ_FLOAT64 
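Structurally, the rewritten opj_t1_encode_cblks() is a fan-out/fan-in: one heap-allocated job per code block, a volatile shared success flag that lets later jobs bail out early, and a mutex that serializes only the distotile accumulation (the per-thread opj_t1_t scratch object is cached via TLS). A minimal sketch of the same pattern using plain pthreads in place of the opj_thread_pool_*/opj_mutex_* wrappers; the job payload and the fake workload are illustrative only:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
    int value;                   /* stand-in for one code block's input */
    volatile int *pret;          /* shared success flag, like job->pret */
    double *accumulator;         /* shared sum, like tile->distotile */
    pthread_mutex_t *mutex;      /* guards only the accumulation */
} job_t;

static void *worker(void *user_data)
{
    job_t *job = (job_t *)user_data;
    if (*(job->pret)) {          /* skip the work if another job already failed */
        double d = (double)job->value * job->value;  /* fake workload */
        pthread_mutex_lock(job->mutex);
        *(job->accumulator) += d;                    /* short critical section */
        pthread_mutex_unlock(job->mutex);
    }
    free(job);                   /* each job frees itself, as above */
    return NULL;
}

int main(void)
{
    enum { N = 8 };
    pthread_t tid[N];
    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    volatile int ret = 1;
    double total = 0.0;
    int i;

    for (i = 0; i < N; i++) {    /* fan-out: one job per "code block" */
        job_t *job = (job_t *)calloc(1, sizeof(job_t));
        job->value = i;
        job->pret = &ret;
        job->accumulator = &total;
        job->mutex = &mutex;
        pthread_create(&tid[i], NULL, worker, job);
    }
    for (i = 0; i < N; i++)      /* fan-in, like opj_thread_pool_wait_completion */
        pthread_join(tid[i], NULL);

    printf("total=%f ok=%d\n", total, ret);
    return 0;
}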
tempwmsedec; + OPJ_INT32* datap; #ifdef EXTRA_DEBUG printf("encode_cblk(x=%d,y=%d,x1=%d,y1=%d,orient=%d,compno=%d,level=%d\n", @@ -2286,10 +2399,19 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, mqc->lut_ctxno_zc_orient = lut_ctxno_zc + (orient << 9); max = 0; - for (i = 0; i < t1->w; ++i) { - for (j = 0; j < t1->h; ++j) { - OPJ_INT32 tmp = abs(t1->data[i + j * t1->data_stride]); - max = opj_int_max(max, tmp); + datap = t1->data; + for (j = 0; j < t1->h; ++j) { + const OPJ_UINT32 w = t1->w; + for (i = 0; i < w; ++i, ++datap) { + OPJ_INT32 tmp = *datap; + if (tmp < 0) { + OPJ_UINT32 tmp_unsigned; + max = opj_int_max(max, -tmp); + tmp_unsigned = opj_to_smr(tmp); + memcpy(datap, &tmp_unsigned, sizeof(OPJ_INT32)); + } else { + max = opj_int_max(max, tmp); + } } } @@ -2297,7 +2419,7 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, T1_NMSEDEC_FRACBITS) : 0; if (cblk->numbps == 0) { cblk->totalpasses = 0; - return; + return cumwmsedec; } bpno = (OPJ_INT32)(cblk->numbps - 1); @@ -2343,7 +2465,6 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, tempwmsedec = opj_t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid, stepsize, numcomps, mct_norms, mct_numcomps) ; cumwmsedec += tempwmsedec; - tile->distotile += tempwmsedec; pass->distortiondec = cumwmsedec; if (opj_t1_enc_is_term_pass(cblk, cblksty, bpno, passtype)) { @@ -2425,4 +2546,6 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, } } #endif + + return cumwmsedec; } diff --git a/3rdparty/openjpeg/openjp2/t1.h b/3rdparty/openjpeg/openjp2/t1.h index 171dfb0a7a..81ad0d00f1 100644 --- a/3rdparty/openjpeg/openjp2/t1.h +++ b/3rdparty/openjpeg/openjp2/t1.h @@ -198,7 +198,6 @@ typedef struct opj_t1 { OPJ_UINT32 h; OPJ_UINT32 datasize; OPJ_UINT32 flagssize; - OPJ_UINT32 data_stride; OPJ_BOOL encoder; /* Thre 3 variables below are only used by the decoder */ @@ -216,13 +215,13 @@ typedef struct opj_t1 { /** Encode the code-blocks of a tile -@param t1 T1 handle +@param tcd TCD handle @param tile The tile to encode @param tcp Tile coding parameters @param mct_norms FIXME DOC @param mct_numcomps Number of components used for MCT */ -OPJ_BOOL opj_t1_encode_cblks(opj_t1_t *t1, +OPJ_BOOL opj_t1_encode_cblks(opj_tcd_t* tcd, opj_tcd_tile_t *tile, opj_tcp_t *tcp, const OPJ_FLOAT64 * mct_norms, diff --git a/3rdparty/openjpeg/openjp2/t2.c b/3rdparty/openjpeg/openjp2/t2.c index 9825118cfd..1481e16f46 100644 --- a/3rdparty/openjpeg/openjp2/t2.c +++ b/3rdparty/openjpeg/openjp2/t2.c @@ -224,6 +224,7 @@ OPJ_BOOL opj_t2_encode_packets(opj_t2_t* p_t2, OPJ_UINT32 * p_data_written, OPJ_UINT32 p_max_len, opj_codestream_info_t *cstr_info, + opj_tcd_marker_info_t* p_marker_info, OPJ_UINT32 p_tp_num, OPJ_INT32 p_tp_pos, OPJ_UINT32 p_pino, @@ -244,7 +245,7 @@ OPJ_BOOL opj_t2_encode_packets(opj_t2_t* p_t2, l_image->numcomps : 1; OPJ_UINT32 l_nb_pocs = l_tcp->numpocs + 1; - l_pi = opj_pi_initialise_encode(l_image, l_cp, p_tile_no, p_t2_mode); + l_pi = opj_pi_initialise_encode(l_image, l_cp, p_tile_no, p_t2_mode, p_manager); if (!l_pi) { return OPJ_FALSE; } @@ -310,6 +311,20 @@ OPJ_BOOL opj_t2_encode_packets(opj_t2_t* p_t2, opj_pi_destroy(l_pi, l_nb_pocs); return OPJ_FALSE; } + + if (p_marker_info && p_marker_info->need_PLT) { + /* One time use intended */ + assert(p_marker_info->packet_count == 0); + assert(p_marker_info->p_packet_size == NULL); + + p_marker_info->p_packet_size = (OPJ_UINT32*) opj_malloc( + opj_get_encoding_packet_count(l_image, l_cp, p_tile_no) * sizeof(OPJ_UINT32)); + if (p_marker_info->p_packet_size == NULL) { + opj_pi_destroy(l_pi, l_nb_pocs); + return 
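Two details in the max-computation loop above are easy to miss: negative coefficients are rewritten in place into a sign-magnitude form (opj_to_smr) so the coding passes can test the sign bit cheaply, and the store goes through memcpy so that writing an unsigned bit pattern through the signed data pointer stays well-defined. A hedged sketch of such a conversion, assuming the conventional layout (sign in bit 31, magnitude in the low 31 bits); see the opj_to_smr definition in t1.h for the authoritative macro:

#include <stdint.h>

static uint32_t to_smr(int32_t v)
{
    /* sign-magnitude: top bit = sign, low 31 bits = |v|; the negation
       goes through int64_t so INT32_MIN cannot overflow (magnitudes are
       assumed to fit in 31 bits, as the encoder guarantees) */
    return (v >= 0) ? (uint32_t)v
                    : ((uint32_t)(-(int64_t)v) | 0x80000000u);
}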
OPJ_FALSE; + } + } + while (opj_pi_next(l_current_pi)) { if (l_current_pi->layno < p_maxlayers) { l_nb_bytes = 0; @@ -326,6 +341,11 @@ OPJ_BOOL opj_t2_encode_packets(opj_t2_t* p_t2, * p_data_written += l_nb_bytes; + if (p_marker_info && p_marker_info->need_PLT) { + p_marker_info->p_packet_size[p_marker_info->packet_count] = l_nb_bytes; + p_marker_info->packet_count ++; + } + /* INDEX >> */ if (cstr_info) { if (cstr_info->index_write) { @@ -405,7 +425,7 @@ OPJ_BOOL opj_t2_decode_packets(opj_tcd_t* tcd, #endif /* create a packet iterator */ - l_pi = opj_pi_create_decode(l_image, l_cp, p_tile_no); + l_pi = opj_pi_create_decode(l_image, l_cp, p_tile_no, p_manager); if (!l_pi) { return OPJ_FALSE; } @@ -673,6 +693,14 @@ static OPJ_BOOL opj_t2_encode_packet(OPJ_UINT32 tileno, OPJ_BOOL packet_empty = OPJ_FALSE; #endif +#ifdef DEBUG_VERBOSE + if (p_t2_mode == FINAL_PASS) { + fprintf(stderr, + "encode packet compno=%d, resno=%d, precno=%d, layno=%d\n", + compno, resno, precno, layno); + } +#endif + /* */ if (tcp->csty & J2K_CP_CSTY_SOP) { if (length < 6) { @@ -711,6 +739,15 @@ static OPJ_BOOL opj_t2_encode_packet(OPJ_UINT32 tileno, continue; } + /* Avoid out of bounds access of https://github.com/uclouvain/openjpeg/issues/1294 */ + /* but likely not a proper fix. */ + if (precno >= res->pw * res->ph) { + opj_event_msg(p_manager, EVT_ERROR, + "opj_t2_encode_packet(): accessing precno=%u >= %u\n", + precno, res->pw * res->ph); + return OPJ_FALSE; + } + prc = &band->precincts[precno]; opj_tgt_reset(prc->incltree); opj_tgt_reset(prc->imsbtree); @@ -778,6 +815,15 @@ static OPJ_BOOL opj_t2_encode_packet(OPJ_UINT32 tileno, continue; } + /* Avoid out of bounds access of https://github.com/uclouvain/openjpeg/issues/1297 */ + /* but likely not a proper fix. */ + if (precno >= res->pw * res->ph) { + opj_event_msg(p_manager, EVT_ERROR, + "opj_t2_encode_packet(): accessing precno=%u >= %u\n", + precno, res->pw * res->ph); + return OPJ_FALSE; + } + prc = &band->precincts[precno]; l_nb_blocks = prc->cw * prc->ch; cblk = prc->cblks.enc; diff --git a/3rdparty/openjpeg/openjp2/t2.h b/3rdparty/openjpeg/openjp2/t2.h index 66500b1699..becfa91a4d 100644 --- a/3rdparty/openjpeg/openjp2/t2.h +++ b/3rdparty/openjpeg/openjp2/t2.h @@ -73,6 +73,7 @@ Encode the packets of a tile to a destination buffer @param p_data_written FIXME DOC @param len the length of the destination buffer @param cstr_info Codestream information structure +@param p_marker_info Marker information structure @param tpnum Tile part number of the current tile @param tppos The position of the tile part flag in the progression order @param pino FIXME DOC @@ -87,6 +88,7 @@ OPJ_BOOL opj_t2_encode_packets(opj_t2_t* t2, OPJ_UINT32 * p_data_written, OPJ_UINT32 len, opj_codestream_info_t *cstr_info, + opj_tcd_marker_info_t* p_marker_info, OPJ_UINT32 tpnum, OPJ_INT32 tppos, OPJ_UINT32 pino, diff --git a/3rdparty/openjpeg/openjp2/tcd.c b/3rdparty/openjpeg/openjp2/tcd.c index be3b84363f..6442669d60 100644 --- a/3rdparty/openjpeg/openjp2/tcd.c +++ b/3rdparty/openjpeg/openjp2/tcd.c @@ -112,7 +112,7 @@ void tcd_dump(FILE *fd, opj_tcd_t *tcd, opj_tcd_image_t * img) * Initializes tile coding/decoding */ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, - OPJ_BOOL isEncoder, OPJ_FLOAT32 fraction, OPJ_SIZE_T sizeof_block, + OPJ_BOOL isEncoder, OPJ_SIZE_T sizeof_block, opj_event_mgr_t* manager); /** @@ -182,6 +182,7 @@ static OPJ_BOOL opj_tcd_t2_encode(opj_tcd_t *p_tcd, OPJ_UINT32 * p_data_written, OPJ_UINT32 p_max_dest_size,
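The p_packet_size[] array filled above is the raw material for PLT marker segments, where each packet length is written as a big-endian base-128 varint with the continuation bit set on every byte but the last (ISO/IEC 15444-1, A.7.3). A sketch of that final encoding step; encode_plt_length is an illustrative helper, not an OpenJPEG API:

#include <stdint.h>

static int encode_plt_length(uint32_t len, uint8_t *out)
{
    uint8_t tmp[5];               /* a 32-bit length needs at most 5 groups */
    int n = 0, i;
    do {                          /* collect 7-bit groups, little end first */
        tmp[n++] = (uint8_t)(len & 0x7Fu);
        len >>= 7;
    } while (len != 0);
    for (i = 0; i < n; i++)       /* emit big-endian, high bit on all but last */
        out[i] = (uint8_t)(tmp[n - 1 - i] | (i + 1 < n ? 0x80u : 0));
    return n;                     /* number of bytes written */
}

For example, a packet length of 300 encodes as the two bytes 0x82 0x2C.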
opj_codestream_info_t *p_cstr_info, + opj_tcd_marker_info_t* p_marker_info, opj_event_mgr_t *p_manager); static OPJ_BOOL opj_tcd_rate_allocate_encode(opj_tcd_t *p_tcd, @@ -573,9 +574,10 @@ OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd, opj_tcd_makelayer(tcd, layno, thresh, 0); if (cp->m_specific_param.m_enc.m_fixed_quality) { /* fixed_quality */ - if (OPJ_IS_CINEMA(cp->rsiz)) { + if (OPJ_IS_CINEMA(cp->rsiz) || OPJ_IS_IMF(cp->rsiz)) { if (! opj_t2_encode_packets(t2, tcd->tcd_tileno, tcd_tile, layno + 1, dest, - p_data_written, maxlen, cstr_info, tcd->cur_tp_num, tcd->tp_pos, tcd->cur_pino, + p_data_written, maxlen, cstr_info, NULL, tcd->cur_tp_num, tcd->tp_pos, + tcd->cur_pino, THRESH_CALC, p_manager)) { lo = thresh; @@ -605,7 +607,8 @@ OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd, } } else { if (! opj_t2_encode_packets(t2, tcd->tcd_tileno, tcd_tile, layno + 1, dest, - p_data_written, maxlen, cstr_info, tcd->cur_tp_num, tcd->tp_pos, tcd->cur_pino, + p_data_written, maxlen, cstr_info, NULL, tcd->cur_tp_num, tcd->tp_pos, + tcd->cur_pino, THRESH_CALC, p_manager)) { /* TODO: what to do with l ??? seek / tell ??? */ /* opj_event_msg(tcd->cinfo, EVT_INFO, "rate alloc: len=%d, max=%d\n", l, maxlen); */ @@ -718,10 +721,9 @@ OPJ_BOOL opj_alloc_tile_component_data(opj_tcd_tilecomp_t *l_tilec) /* ----------------------------------------------------------------------- */ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, - OPJ_BOOL isEncoder, OPJ_FLOAT32 fraction, OPJ_SIZE_T sizeof_block, + OPJ_BOOL isEncoder, OPJ_SIZE_T sizeof_block, opj_event_mgr_t* manager) { - OPJ_UINT32(*l_gain_ptr)(OPJ_UINT32) = 00; OPJ_UINT32 compno, resno, bandno, precno, cblkno; opj_tcp_t * l_tcp = 00; opj_cp_t * l_cp = 00; @@ -737,7 +739,6 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, OPJ_UINT32 p, q; OPJ_UINT32 l_level_no; OPJ_UINT32 l_pdx, l_pdy; - OPJ_UINT32 l_gain; OPJ_INT32 l_x0b, l_y0b; OPJ_UINT32 l_tx0, l_ty0; /* extent of precincts , top left, bottom right**/ @@ -876,11 +877,6 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, l_level_no = l_tilec->numresolutions; l_res = l_tilec->resolutions; l_step_size = l_tccp->stepsizes; - if (l_tccp->qmfbid == 0) { - l_gain_ptr = &opj_dwt_getgain_real; - } else { - l_gain_ptr = &opj_dwt_getgain; - } /*fprintf(stderr, "\tlevel_no=%d\n",l_level_no);*/ for (resno = 0; resno < l_tilec->numresolutions; ++resno) { @@ -905,8 +901,24 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, /* p. 
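The rate-allocation fragments above (the lo = thresh branches) sit inside a bisection over the truncation threshold: each trial threshold is used to build layers and simulate packet emission, and the interval is narrowed depending on whether the result fits the rate budget. A generic sketch of that search, with a hypothetical fits() predicate standing in for the trial opj_t2_encode_packets() run (the real loop also tracks the best stable threshold and distortion targets):

static double search_threshold(double lo, double hi,
                               int (*fits)(double thresh, void *ctx),
                               void *ctx)
{
    int i;
    for (i = 0; i < 32; i++) {    /* bounded iteration, as in rateallocate */
        double mid = (lo + hi) / 2.0;
        if (fits(mid, ctx))
            hi = mid;             /* fits: try a smaller threshold, keep more data */
        else
            lo = mid;             /* too big: truncate more aggressively */
    }
    return hi;                    /* smallest threshold known to fit */
}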
64, B.6, ISO/IEC FDIS15444-1 : 2000 (18 august 2000) */ l_tl_prc_x_start = opj_int_floordivpow2(l_res->x0, (OPJ_INT32)l_pdx) << l_pdx; l_tl_prc_y_start = opj_int_floordivpow2(l_res->y0, (OPJ_INT32)l_pdy) << l_pdy; - l_br_prc_x_end = opj_int_ceildivpow2(l_res->x1, (OPJ_INT32)l_pdx) << l_pdx; - l_br_prc_y_end = opj_int_ceildivpow2(l_res->y1, (OPJ_INT32)l_pdy) << l_pdy; + { + OPJ_UINT32 tmp = ((OPJ_UINT32)opj_int_ceildivpow2(l_res->x1, + (OPJ_INT32)l_pdx)) << l_pdx; + if (tmp > (OPJ_UINT32)INT_MAX) { + opj_event_msg(manager, EVT_ERROR, "Integer overflow\n"); + return OPJ_FALSE; + } + l_br_prc_x_end = (OPJ_INT32)tmp; + } + { + OPJ_UINT32 tmp = ((OPJ_UINT32)opj_int_ceildivpow2(l_res->y1, + (OPJ_INT32)l_pdy)) << l_pdy; + if (tmp > (OPJ_UINT32)INT_MAX) { + opj_event_msg(manager, EVT_ERROR, "Integer overflow\n"); + return OPJ_FALSE; + } + l_br_prc_y_end = (OPJ_INT32)tmp; + } /*fprintf(stderr, "\t\t\tprc_x_start=%d, prc_y_start=%d, br_prc_x_end=%d, br_prc_y_end=%d \n", l_tl_prc_x_start, l_tl_prc_y_start, l_br_prc_x_end ,l_br_prc_y_end );*/ l_res->pw = (l_res->x0 == l_res->x1) ? 0U : (OPJ_UINT32)(( @@ -951,7 +963,6 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, l_band = l_res->bands; for (bandno = 0; bandno < l_res->numbands; ++bandno, ++l_band, ++l_step_size) { - OPJ_INT32 numbps; /*fprintf(stderr, "\t\t\tband_no=%d/%d\n", bandno, l_res->numbands );*/ if (resno == 0) { @@ -987,11 +998,24 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, } } - /** avoid an if with storing function pointer */ - l_gain = (*l_gain_ptr)(l_band->bandno); - numbps = (OPJ_INT32)(l_image_comp->prec + l_gain); - l_band->stepsize = (OPJ_FLOAT32)(((1.0 + l_step_size->mant / 2048.0) * pow(2.0, - (OPJ_INT32)(numbps - l_step_size->expn)))) * fraction; + { + /* Table E-1 - Sub-band gains */ + /* BUG_WEIRD_TWO_INVK (look for this identifier in dwt.c): */ + /* the test (!isEncoder && l_tccp->qmfbid == 0) is strongly */ + /* linked to the use of two_invK instead of invK */ + const OPJ_INT32 log2_gain = (!isEncoder && + l_tccp->qmfbid == 0) ? 0 : (l_band->bandno == 0) ? 0 : + (l_band->bandno == 3) ? 2 : 1; + + /* Nominal dynamic range. 
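The two new l_br_prc_x_end/l_br_prc_y_end blocks above replace a plain ceildivpow2-then-shift with a guarded version: the shift is done in unsigned arithmetic and range-checked against INT_MAX before narrowing back to the signed precinct coordinate. The same pattern in isolation (checked_ceil_align is illustrative; like opj_int_ceildivpow2, the ceil-division is widened to 64 bits, and very large shift amounts are assumed to be bounded as in the real code):

#include <limits.h>

/* Round v up to a multiple of 2^log2_step without signed overflow. */
static int checked_ceil_align(int v, int log2_step, int *out)
{
    unsigned int ceildiv =
        (unsigned int)(((long long)v + (1LL << log2_step) - 1) >> log2_step);
    unsigned int tmp = ceildiv << log2_step;    /* wraps instead of UB */
    if (tmp > (unsigned int)INT_MAX)
        return 0;                 /* would overflow the signed coordinate */
    *out = (int)tmp;
    return 1;
}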
Equation E-4 */ + const OPJ_INT32 Rb = (OPJ_INT32)l_image_comp->prec + log2_gain; + + /* Delta_b value of Equation E-3 in "E.1 Inverse quantization + * procedure" of the standard */ + l_band->stepsize = (OPJ_FLOAT32)(((1.0 + l_step_size->mant / 2048.0) * pow(2.0, + (OPJ_INT32)(Rb - l_step_size->expn)))); + } + /* Mb value of Equation E-2 in "E.1 Inverse quantization * procedure" of the standard */ l_band->numbps = l_step_size->expn + (OPJ_INT32)l_tccp->numgbits - @@ -1174,14 +1198,14 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, OPJ_BOOL opj_tcd_init_encode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, opj_event_mgr_t* p_manager) { - return opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_TRUE, 1.0F, + return opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_TRUE, sizeof(opj_tcd_cblk_enc_t), p_manager); } OPJ_BOOL opj_tcd_init_decode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, opj_event_mgr_t* p_manager) { - return opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_FALSE, 0.5F, + return opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_FALSE, sizeof(opj_tcd_cblk_dec_t), p_manager); } @@ -1219,10 +1243,16 @@ static OPJ_BOOL opj_tcd_code_block_enc_allocate_data(opj_tcd_cblk_enc_t * /* +1 is needed for https://github.com/uclouvain/openjpeg/issues/835 */ /* and actually +2 required for https://github.com/uclouvain/openjpeg/issues/982 */ + /* and +7 for https://github.com/uclouvain/openjpeg/issues/1283 (-M 3) */ + /* and +26 for https://github.com/uclouvain/openjpeg/issues/1283 (-M 7) */ + /* and +28 for https://github.com/uclouvain/openjpeg/issues/1283 (-M 44) */ + /* and +33 for https://github.com/uclouvain/openjpeg/issues/1283 (-M 4) */ + /* and +63 for https://github.com/uclouvain/openjpeg/issues/1283 (-M 4 -IMF 2K) */ + /* and +74 for https://github.com/uclouvain/openjpeg/issues/1283 (-M 4 -n 8 -s 7,7 -I) */ /* TODO: is there a theoretical upper-bound for the compressed code */ /* block size ? */ - l_data_size = 2 + (OPJ_UINT32)((p_code_block->x1 - p_code_block->x0) * - (p_code_block->y1 - p_code_block->y0) * (OPJ_INT32)sizeof(OPJ_UINT32)); + l_data_size = 74 + (OPJ_UINT32)((p_code_block->x1 - p_code_block->x0) * + (p_code_block->y1 - p_code_block->y0) * (OPJ_INT32)sizeof(OPJ_UINT32)); if (l_data_size > p_code_block->data_size) { if (p_code_block->data) { @@ -1354,6 +1384,7 @@ OPJ_BOOL opj_tcd_encode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 * p_data_written, OPJ_UINT32 p_max_length, opj_codestream_info_t *p_cstr_info, + opj_tcd_marker_info_t* p_marker_info, opj_event_mgr_t *p_manager) { @@ -1433,7 +1464,7 @@ OPJ_BOOL opj_tcd_encode_tile(opj_tcd_t *p_tcd, /* FIXME _ProfStart(PGROUP_T2); */ if (! 
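For reference, the stepsize computation above is a direct transcription of the inverse-quantization formulas of ISO/IEC 15444-1 Annex E; in the standard's notation:

\Delta_b = 2^{R_b - \varepsilon_b}\left(1 + \frac{\mu_b}{2^{11}}\right), \qquad R_b = B + \log_2(\mathrm{gain}_b)

where B is the component bit depth (l_image_comp->prec), \varepsilon_b and \mu_b are the quantizer exponent and mantissa (l_step_size->expn and l_step_size->mant), and the Table E-1 gains are 1 (LL), 2 (HL, LH) and 4 (HH), giving the log2_gain values 0, 1, 2 selected in the code. The Mb of Equation E-2 computed just after is M_b = G + \varepsilon_b - 1, with G the number of guard bits (l_tccp->numgbits).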
opj_tcd_t2_encode(p_tcd, p_dest, p_data_written, p_max_length, - p_cstr_info, p_manager)) { + p_cstr_info, p_marker_info, p_manager)) { return OPJ_FALSE; } /* FIXME _ProfStop(PGROUP_T2); */ @@ -2017,7 +2048,8 @@ static OPJ_BOOL opj_tcd_mct_decode(opj_tcd_t *p_tcd, opj_event_mgr_t *p_manager) opj_tcd_tile_t * l_tile = p_tcd->tcd_image->tiles; opj_tcp_t * l_tcp = p_tcd->tcp; opj_tcd_tilecomp_t * l_tile_comp = l_tile->comps; - OPJ_UINT32 l_samples, i; + OPJ_SIZE_T l_samples; + OPJ_UINT32 i; if (l_tcp->mct == 0 || p_tcd->used_component != NULL) { return OPJ_TRUE; @@ -2030,8 +2062,8 @@ static OPJ_BOOL opj_tcd_mct_decode(opj_tcd_t *p_tcd, opj_event_mgr_t *p_manager) /* A bit inefficient: we process more data than needed if */ /* resno_decoded < l_tile_comp->minimum_num_resolutions-1, */ /* but we would need to take into account a stride then */ - l_samples = (OPJ_UINT32)((res_comp0->x1 - res_comp0->x0) * - (res_comp0->y1 - res_comp0->y0)); + l_samples = (OPJ_SIZE_T)(res_comp0->x1 - res_comp0->x0) * + (OPJ_SIZE_T)(res_comp0->y1 - res_comp0->y0); if (l_tile->numcomps >= 3) { if (l_tile_comp->minimum_num_resolutions != l_tile->comps[1].minimum_num_resolutions || @@ -2065,8 +2097,8 @@ static OPJ_BOOL opj_tcd_mct_decode(opj_tcd_t *p_tcd, opj_event_mgr_t *p_manager) opj_tcd_resolution_t* res_comp0 = l_tile->comps[0].resolutions + p_tcd->image->comps[0].resno_decoded; - l_samples = (res_comp0->win_x1 - res_comp0->win_x0) * - (res_comp0->win_y1 - res_comp0->win_y0); + l_samples = (OPJ_SIZE_T)(res_comp0->win_x1 - res_comp0->win_x0) * + (OPJ_SIZE_T)(res_comp0->win_y1 - res_comp0->win_y0); if (l_tile->numcomps >= 3) { opj_tcd_resolution_t* res_comp1 = l_tile->comps[1].resolutions + p_tcd->image->comps[1].resno_decoded; @@ -2332,7 +2364,7 @@ static void opj_tcd_code_block_enc_deallocate(opj_tcd_precinct_t * p_precinct) } } -OPJ_SIZE_T opj_tcd_get_encoded_tile_size(opj_tcd_t *p_tcd) +OPJ_SIZE_T opj_tcd_get_encoder_input_buffer_size(opj_tcd_t *p_tcd) { OPJ_UINT32 i; OPJ_SIZE_T l_data_size = 0; @@ -2390,7 +2422,8 @@ static OPJ_BOOL opj_tcd_dc_level_shift_encode(opj_tcd_t *p_tcd) } } else { for (i = 0; i < l_nb_elem; ++i) { - *l_current_ptr = (*l_current_ptr - l_tccp->m_dc_level_shift) * (1 << 11); + *((OPJ_FLOAT32 *) l_current_ptr) = (OPJ_FLOAT32)(*l_current_ptr - + l_tccp->m_dc_level_shift); ++l_current_ptr; } } @@ -2448,8 +2481,11 @@ static OPJ_BOOL opj_tcd_mct_encode(opj_tcd_t *p_tcd) opj_free(l_data); } else if (l_tcp->tccps->qmfbid == 0) { - opj_mct_encode_real(l_tile->comps[0].data, l_tile->comps[1].data, - l_tile->comps[2].data, samples); + opj_mct_encode_real( + (OPJ_FLOAT32*)l_tile->comps[0].data, + (OPJ_FLOAT32*)l_tile->comps[1].data, + (OPJ_FLOAT32*)l_tile->comps[2].data, + samples); } else { opj_mct_encode(l_tile->comps[0].data, l_tile->comps[1].data, l_tile->comps[2].data, samples); @@ -2467,11 +2503,11 @@ static OPJ_BOOL opj_tcd_dwt_encode(opj_tcd_t *p_tcd) for (compno = 0; compno < l_tile->numcomps; ++compno) { if (l_tccp->qmfbid == 1) { - if (! opj_dwt_encode(l_tile_comp)) { + if (! opj_dwt_encode(p_tcd, l_tile_comp)) { return OPJ_FALSE; } } else if (l_tccp->qmfbid == 0) { - if (! opj_dwt_encode_real(l_tile_comp)) { + if (! 
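The two l_samples hunks above fix the same class of bug: the product of two 32-bit extents overflows before the assignment widens it. Promoting each factor first keeps the whole multiplication in size_t range; a minimal form of the fix:

#include <stddef.h>
#include <stdint.h>

static size_t sample_count(int32_t x0, int32_t y0, int32_t x1, int32_t y1)
{
    /* cast each factor before multiplying; (x1 - x0) * (y1 - y0) evaluated
       in 32-bit arithmetic could wrap for large tiles */
    return (size_t)(x1 - x0) * (size_t)(y1 - y0);
}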
opj_dwt_encode_real(p_tcd, l_tile_comp)) { return OPJ_FALSE; } } @@ -2485,16 +2521,10 @@ static OPJ_BOOL opj_tcd_dwt_encode(opj_tcd_t *p_tcd) static OPJ_BOOL opj_tcd_t1_encode(opj_tcd_t *p_tcd) { - opj_t1_t * l_t1; const OPJ_FLOAT64 * l_mct_norms; OPJ_UINT32 l_mct_numcomps = 0U; opj_tcp_t * l_tcp = p_tcd->tcp; - l_t1 = opj_t1_create(OPJ_TRUE); - if (l_t1 == 00) { - return OPJ_FALSE; - } - if (l_tcp->mct == 1) { l_mct_numcomps = 3U; /* irreversible encoding */ @@ -2508,13 +2538,9 @@ static OPJ_BOOL opj_tcd_t1_encode(opj_tcd_t *p_tcd) l_mct_norms = (const OPJ_FLOAT64 *)(l_tcp->mct_norms); } - if (! opj_t1_encode_cblks(l_t1, p_tcd->tcd_image->tiles, l_tcp, l_mct_norms, - l_mct_numcomps)) { - opj_t1_destroy(l_t1); - return OPJ_FALSE; - } - - opj_t1_destroy(l_t1); + return opj_t1_encode_cblks(p_tcd, + p_tcd->tcd_image->tiles, l_tcp, l_mct_norms, + l_mct_numcomps); return OPJ_TRUE; } @@ -2524,6 +2550,7 @@ static OPJ_BOOL opj_tcd_t2_encode(opj_tcd_t *p_tcd, OPJ_UINT32 * p_data_written, OPJ_UINT32 p_max_dest_size, opj_codestream_info_t *p_cstr_info, + opj_tcd_marker_info_t* p_marker_info, opj_event_mgr_t *p_manager) { opj_t2_t * l_t2; @@ -2542,6 +2569,7 @@ static OPJ_BOOL opj_tcd_t2_encode(opj_tcd_t *p_tcd, p_data_written, p_max_dest_size, p_cstr_info, + p_marker_info, p_tcd->tp_num, p_tcd->tp_pos, p_tcd->cur_pino, @@ -2600,7 +2628,7 @@ OPJ_BOOL opj_tcd_copy_tile_data(opj_tcd_t *p_tcd, OPJ_UINT32 l_size_comp, l_remaining; OPJ_SIZE_T l_nb_elem; - l_data_size = opj_tcd_get_encoded_tile_size(p_tcd); + l_data_size = opj_tcd_get_encoder_input_buffer_size(p_tcd); if (l_data_size != p_src_length) { return OPJ_FALSE; } @@ -2802,3 +2830,30 @@ static OPJ_BOOL opj_tcd_is_whole_tilecomp_decoding(opj_tcd_t *p_tcd, (((OPJ_UINT32)tilec->x1 - tcx1) >> shift) == 0 && (((OPJ_UINT32)tilec->y1 - tcy1) >> shift) == 0))); } + +/* ----------------------------------------------------------------------- */ + +opj_tcd_marker_info_t* opj_tcd_marker_info_create(OPJ_BOOL need_PLT) +{ + opj_tcd_marker_info_t *l_tcd_marker_info = + (opj_tcd_marker_info_t*) opj_calloc(1, sizeof(opj_tcd_marker_info_t)); + if (!l_tcd_marker_info) { + return NULL; + } + + l_tcd_marker_info->need_PLT = need_PLT; + + return l_tcd_marker_info; +} + +/* ----------------------------------------------------------------------- */ + +void opj_tcd_marker_info_destroy(opj_tcd_marker_info_t *p_tcd_marker_info) +{ + if (p_tcd_marker_info) { + opj_free(p_tcd_marker_info->p_packet_size); + opj_free(p_tcd_marker_info); + } +} + +/* ----------------------------------------------------------------------- */ diff --git a/3rdparty/openjpeg/openjp2/tcd.h b/3rdparty/openjpeg/openjp2/tcd.h index e3214c1d98..f1b52b8dac 100644 --- a/3rdparty/openjpeg/openjp2/tcd.h +++ b/3rdparty/openjpeg/openjp2/tcd.h @@ -284,6 +284,22 @@ typedef struct opj_tcd { OPJ_BOOL* used_component; } opj_tcd_t; +/** + * Structure to hold information needed to generate some markers. + * Used by encoder. 
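The opj_tcd_dc_level_shift_encode() rewrite above stops scaling the irreversible path by 2^11 fixed-point and instead stores the DC-shifted value as a real float, reusing the integer buffer in place. An isolated sketch of that rewrite; it relies on the same type-punning assumption the patch itself makes (sizeof(float) == sizeof(OPJ_INT32) and a compiler that tolerates the cast):

#include <stddef.h>
#include <stdint.h>

static void dc_shift_to_float(int32_t *buf, size_t n, int32_t dc_shift)
{
    size_t i;
    for (i = 0; i < n; i++) {
        /* write the float bit pattern into the same slot; downstream
           irreversible (qmfbid == 0) stages read the buffer as float */
        *((float *)&buf[i]) = (float)(buf[i] - dc_shift);
    }
}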
+ */ +typedef struct opj_tcd_marker_info { + /** In: Whether information to generate PLT markers is needed */ + OPJ_BOOL need_PLT; + + /** OUT: Number of elements in p_packet_size[] array */ + OPJ_UINT32 packet_count; + + /** OUT: Array of size packet_count, such that p_packet_size[i] is + * the size in bytes of the ith packet */ + OPJ_UINT32* p_packet_size; +} opj_tcd_marker_info_t; + /** @name Exported functions */ /*@{*/ /* ----------------------------------------------------------------------- */ @@ -306,6 +322,21 @@ Destroy a previously created TCD handle */ void opj_tcd_destroy(opj_tcd_t *tcd); + +/** + * Create a new opj_tcd_marker_info_t* structure + * @param need_PLT Whether information is needed to generate PLT markers. + */ +opj_tcd_marker_info_t* opj_tcd_marker_info_create(OPJ_BOOL need_PLT); + + +/** +Destroy a previously created opj_tcd_marker_info_t* structure +@param p_tcd_marker_info Structure to destroy +*/ +void opj_tcd_marker_info_destroy(opj_tcd_marker_info_t *p_tcd_marker_info); + + /** * Initialize the tile coder and may reuse some memory. * @param p_tcd TCD handle. @@ -364,6 +395,7 @@ OPJ_UINT32 opj_tcd_get_decoded_tile_size(opj_tcd_t *p_tcd, * @param p_data_written pointer to an int that is incremented by the number of bytes really written on p_dest * @param p_len Maximum length of the destination buffer * @param p_cstr_info Codestream information structure + * @param p_marker_info Marker information structure * @param p_manager the user event manager * @return true if the coding is successful. */ @@ -373,6 +405,7 @@ OPJ_BOOL opj_tcd_encode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 * p_data_written, OPJ_UINT32 p_len, struct opj_codestream_info *p_cstr_info, + opj_tcd_marker_info_t* p_marker_info, opj_event_mgr_t *p_manager); @@ -415,9 +448,11 @@ OPJ_BOOL opj_tcd_update_tile_data(opj_tcd_t *p_tcd, OPJ_UINT32 p_dest_length); /** - * + * Get the size in bytes of the input buffer provided before encoding. + * This must be the size provided to the p_src_length argument of + * opj_tcd_copy_tile_data() */ -OPJ_SIZE_T opj_tcd_get_encoded_tile_size(opj_tcd_t *p_tcd); +OPJ_SIZE_T opj_tcd_get_encoder_input_buffer_size(opj_tcd_t *p_tcd); /** * Initialize the tile coder and may reuse some memory. @@ -433,6 +468,8 @@ OPJ_BOOL opj_tcd_init_encode_tile(opj_tcd_t *p_tcd, /** * Copies tile data from the given memory block onto the system.
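Putting the new API together, a consumer creates the structure, passes it through the encode call, then reads back the per-packet sizes. A sketch of that lifecycle using only the declarations above (the other opj_tcd_encode_tile() arguments are elided):

/* Sketch only: types and functions are the ones declared above. */
opj_tcd_marker_info_t *mi = opj_tcd_marker_info_create(OPJ_TRUE);
if (mi != NULL) {
    /* ... opj_tcd_encode_tile(p_tcd, ..., mi, p_manager) fills the
       OUT fields during T2 encoding ... */
    OPJ_UINT32 i;
    for (i = 0; i < mi->packet_count; i++) {
        /* mi->p_packet_size[i]: byte size of the i-th packet, ready to
           be emitted into a PLT marker segment */
    }
    opj_tcd_marker_info_destroy(mi);   /* frees p_packet_size too */
}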
+ * + * p_src_length must be equal to opj_tcd_get_encoder_input_buffer_size() */ OPJ_BOOL opj_tcd_copy_tile_data(opj_tcd_t *p_tcd, OPJ_BYTE * p_src, diff --git a/3rdparty/protobuf/CMakeLists.txt b/3rdparty/protobuf/CMakeLists.txt index c71bf9faff..f249d2dcc3 100644 --- a/3rdparty/protobuf/CMakeLists.txt +++ b/3rdparty/protobuf/CMakeLists.txt @@ -153,6 +153,11 @@ set_target_properties(libprotobuf ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH} ) +if(ANDROID) + # https://github.com/opencv/opencv/issues/17282 + target_link_libraries(libprotobuf INTERFACE "-landroid" "-llog") +endif() + get_protobuf_version(Protobuf_VERSION "${PROTOBUF_ROOT}/src") set(Protobuf_VERSION ${Protobuf_VERSION} CACHE INTERNAL "" FORCE) diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt index a085b0f3ca..50f3e6ccf1 100644 --- a/3rdparty/tbb/CMakeLists.txt +++ b/3rdparty/tbb/CMakeLists.txt @@ -170,4 +170,4 @@ ocv_install_target(tbb EXPORT OpenCVModules ocv_install_3rdparty_licenses(tbb "${tbb_src_dir}/LICENSE" "${tbb_src_dir}/README") -ocv_tbb_read_version("${tbb_src_dir}/include") +ocv_tbb_read_version("${tbb_src_dir}/include" tbb) diff --git a/CMakeLists.txt b/CMakeLists.txt index 487efd5f7e..78327c1a70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,9 +17,7 @@ endif() include(cmake/OpenCVMinDepVersions.cmake) -if(CMAKE_GENERATOR MATCHES Xcode AND XCODE_VERSION VERSION_GREATER 4.3) - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) -elseif(CMAKE_SYSTEM_NAME MATCHES WindowsPhone OR CMAKE_SYSTEM_NAME MATCHES WindowsStore) +if(CMAKE_SYSTEM_NAME MATCHES WindowsPhone OR CMAKE_SYSTEM_NAME MATCHES WindowsStore) cmake_minimum_required(VERSION 3.1 FATAL_ERROR) #Required to resolve linker error issues due to incompatibility with CMake v3.0+ policies. #CMake fails to find _fseeko() which leads to subsequent linker error. 
@@ -215,7 +213,7 @@ OCV_OPTION(OPENCV_ENABLE_NONFREE "Enable non-free algorithms" OFF) OCV_OPTION(OPENCV_FORCE_3RDPARTY_BUILD "Force using 3rdparty code from source" OFF) OCV_OPTION(BUILD_ZLIB "Build zlib from source" (WIN32 OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) ) OCV_OPTION(BUILD_TIFF "Build libtiff from source" (WIN32 OR ANDROID OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) ) -OCV_OPTION(BUILD_OPENJPEG "Build OpenJPEG from source" (WIN32 OR ANDRIOD OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) ) +OCV_OPTION(BUILD_OPENJPEG "Build OpenJPEG from source" (WIN32 OR ANDROID OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) ) OCV_OPTION(BUILD_JASPER "Build libjasper from source" (WIN32 OR ANDROID OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) ) OCV_OPTION(BUILD_JPEG "Build libjpeg from source" (WIN32 OR ANDROID OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) ) OCV_OPTION(BUILD_PNG "Build libpng from source" (WIN32 OR ANDROID OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) ) @@ -398,11 +396,11 @@ OCV_OPTION(WITH_OPENCL_D3D11_NV "Include NVIDIA OpenCL D3D11 support" WITH_DIREC OCV_OPTION(WITH_LIBREALSENSE "Include Intel librealsense support" OFF VISIBLE_IF NOT WITH_INTELPERC VERIFY HAVE_LIBREALSENSE) -OCV_OPTION(WITH_VA "Include VA support" OFF - VISIBLE_IF UNIX AND NOT ANDROID +OCV_OPTION(WITH_VA "Include VA support" (X86_64 OR X86) + VISIBLE_IF UNIX AND NOT APPLE AND NOT ANDROID VERIFY HAVE_VA) -OCV_OPTION(WITH_VA_INTEL "Include Intel VA-API/OpenCL support" OFF - VISIBLE_IF UNIX AND NOT ANDROID +OCV_OPTION(WITH_VA_INTEL "Include Intel VA-API/OpenCL support" (X86_64 OR X86) + VISIBLE_IF UNIX AND NOT APPLE AND NOT ANDROID VERIFY HAVE_VA_INTEL) OCV_OPTION(WITH_MFX "Include Intel Media SDK support" OFF VISIBLE_IF (UNIX AND NOT ANDROID) OR (WIN32 AND NOT WINRT AND NOT MINGW) @@ -440,6 +438,9 @@ OCV_OPTION(WITH_QUIRC "Include library QR-code decoding" ON OCV_OPTION(WITH_ANDROID_MEDIANDK "Use Android Media NDK for Video I/O (Android)" (ANDROID_NATIVE_API_LEVEL GREATER 20) VISIBLE_IF ANDROID VERIFY HAVE_ANDROID_MEDIANDK) +OCV_OPTION(WITH_ANDROID_NATIVE_CAMERA "Use Android NDK for Camera I/O (Android)" (ANDROID_NATIVE_API_LEVEL GREATER 23) + VISIBLE_IF ANDROID + VERIFY HAVE_ANDROID_NATIVE_CAMERA) OCV_OPTION(WITH_TENGINE "Include Arm Inference Tengine support" OFF VISIBLE_IF (ARM OR AARCH64) AND (UNIX OR ANDROID) AND NOT IOS VERIFY HAVE_TENGINE) @@ -480,7 +481,7 @@ OCV_OPTION(INSTALL_TESTS "Install accuracy and performance test binar # OpenCV build options # =================================================== -OCV_OPTION(ENABLE_CCACHE "Use ccache" (UNIX AND NOT IOS AND (CMAKE_GENERATOR MATCHES "Makefile" OR CMAKE_GENERATOR MATCHES "Ninja")) ) +OCV_OPTION(ENABLE_CCACHE "Use ccache" (UNIX AND (CMAKE_GENERATOR MATCHES "Makefile" OR CMAKE_GENERATOR MATCHES "Ninja" OR CMAKE_GENERATOR MATCHES "Xcode")) ) OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" MSVC IF (MSVC OR (NOT IOS AND NOT CMAKE_CROSSCOMPILING) ) ) OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) ) OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CV_GCC ) @@ -489,7 +490,7 @@ OCV_OPTION(OPENCV_ENABLE_MEMORY_SANITIZER "Better support for memory/address san OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CV_GCC ) OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) ) OCV_OPTION(ENABLE_FAST_MATH "Enable compiler options for fast math optimizations on FP 
computations (not recommended)" OFF) -if(NOT IOS AND (NOT ANDROID OR OPENCV_ANDROID_USE_LEGACY_FLAGS)) # Use CPU_BASELINE instead +if(NOT IOS AND (NOT ANDROID OR OPENCV_ANDROID_USE_LEGACY_FLAGS) AND CMAKE_CROSSCOMPILING) # Use CPU_BASELINE instead OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) ) OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) ) endif() @@ -668,9 +669,18 @@ if(UNIX) CHECK_SYMBOL_EXISTS(memalign malloc.h HAVE_MEMALIGN) endif() # TODO: - # - _aligned_malloc() on Win32 # - std::aligned_alloc() C++17 / C11 endif() +elseif(WIN32) + include(CheckIncludeFile) + include(CheckSymbolExists) + + if(OPENCV_ENABLE_MEMALIGN) + CHECK_INCLUDE_FILE(malloc.h HAVE_MALLOC_H) + if(HAVE_MALLOC_H) + CHECK_SYMBOL_EXISTS(_aligned_malloc malloc.h HAVE_WIN32_ALIGNED_MALLOC) + endif() + endif() endif() include(cmake/OpenCVPCHSupport.cmake) @@ -736,8 +746,15 @@ if(ENABLE_FLAKE8 AND PYTHON_DEFAULT_AVAILABLE) include("${CMAKE_CURRENT_LIST_DIR}/cmake/FindFlake8.cmake") endif() if(FLAKE8_FOUND) + list(APPEND OPENCV_FLAKE8_EXCLUDES ".git" "__pycache__" "config.py" "*.config.py" "config-*.py") + list(APPEND OPENCV_FLAKE8_EXCLUDES "svgfig.py") # 3rdparty + if(NOT PYTHON3_VERSION_STRING VERSION_GREATER 3.6) + # Python 3.6+ (PEP 526): variable annotations (type hints) + list(APPEND OPENCV_FLAKE8_EXCLUDES "samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs") + endif() + string(REPLACE ";" "," OPENCV_FLAKE8_EXCLUDES_STR "${OPENCV_FLAKE8_EXCLUDES}") add_custom_target(check_flake8 - COMMAND "${FLAKE8_EXECUTABLE}" . --count --select=E9,E901,E999,F821,F822,F823 --show-source --statistics --exclude='.git,__pycache__,*.config.py,svgfig.py' + COMMAND "${FLAKE8_EXECUTABLE}" . 
--count --select=E9,E901,E999,F821,F822,F823 --show-source --statistics --exclude='${OPENCV_FLAKE8_EXCLUDES_STR}' WORKING_DIRECTORY "${OpenCV_SOURCE_DIR}" COMMENT "Running flake8" ) @@ -989,6 +1006,12 @@ if(COMMAND ocv_pylint_finalize) ocv_pylint_add_directory_recurse(${CMAKE_CURRENT_LIST_DIR}/samples/python/tutorial_code) ocv_pylint_finalize() endif() +if(TARGET check_pylint) + message(STATUS "Registered 'check_pylint' target: using ${PYLINT_EXECUTABLE} (ver: ${PYLINT_VERSION}), checks: ${PYLINT_TOTAL_TARGETS}") +endif() +if(TARGET check_flake8) + message(STATUS "Registered 'check_flake8' target: using ${FLAKE8_EXECUTABLE} (ver: ${FLAKE8_VERSION})") +endif() if(OPENCV_GENERATE_SETUPVARS) include(cmake/OpenCVGenSetupVars.cmake) @@ -1047,7 +1070,9 @@ endif() if(CMAKE_GENERATOR MATCHES Xcode) status(" Xcode:" ${XCODE_VERSION}) endif() -if(NOT CMAKE_GENERATOR MATCHES "Xcode|Visual Studio") +if(CMAKE_GENERATOR MATCHES "Xcode|Visual Studio|Multi-Config") + status(" Configuration:" ${CMAKE_CONFIGURATION_TYPES}) +else() status(" Configuration:" ${CMAKE_BUILD_TYPE}) endif() @@ -1620,12 +1645,6 @@ endif() status("") status(" Python (for build):" PYTHON_DEFAULT_AVAILABLE THEN "${PYTHON_DEFAULT_EXECUTABLE}" ELSE NO) -if(PYLINT_FOUND AND PYLINT_EXECUTABLE) - status(" Pylint:" PYLINT_FOUND THEN "${PYLINT_EXECUTABLE} (ver: ${PYLINT_VERSION}, checks: ${PYLINT_TOTAL_TARGETS})" ELSE NO) -endif() -if(FLAKE8_FOUND AND FLAKE8_EXECUTABLE) - status(" Flake8:" FLAKE8_FOUND THEN "${FLAKE8_EXECUTABLE} (ver: ${FLAKE8_VERSION})" ELSE NO) -endif() # ========================== java ========================== if(BUILD_JAVA) diff --git a/README.md b/README.md index 0653a9e73e..b9897205ba 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,8 @@ * Homepage: <https://opencv.org> * Courses: <https://opencv.org/courses> * Docs: <https://docs.opencv.org/master/> -* Q&A forum: <http://answers.opencv.org> +* Q&A forum: <https://forum.opencv.org> + * previous forum (read only): <http://answers.opencv.org> * Issue tracking: <https://github.com/opencv/opencv/issues> * Additional OpenCV functionality: <https://github.com/opencv/opencv_contrib> diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 1504fa61c4..e9a7be7c66 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -59,3 +59,4 @@ ocv_add_app(annotation) ocv_add_app(visualisation) ocv_add_app(interactive-calibration) ocv_add_app(version) +ocv_add_app(model-diagnostics) diff --git a/apps/model-diagnostics/CMakeLists.txt b/apps/model-diagnostics/CMakeLists.txt new file mode 100644 index 0000000000..b48f8264ff --- /dev/null +++ b/apps/model-diagnostics/CMakeLists.txt @@ -0,0 +1,3 @@ +ocv_add_application(opencv_model_diagnostics + MODULES opencv_core opencv_dnn + SRCS model_diagnostics.cpp) diff --git a/apps/model-diagnostics/model_diagnostics.cpp b/apps/model-diagnostics/model_diagnostics.cpp new file mode 100644 index 0000000000..2ffeaa1ea5 --- /dev/null +++ b/apps/model-diagnostics/model_diagnostics.cpp @@ -0,0 +1,65 @@ +/************************************************* +USAGE: +./model_diagnostics -m <model file location> +**************************************************/ +#include <opencv2/dnn.hpp> +#include <opencv2/core/utils/filesystem.hpp> + +#include <iostream> + + +using namespace cv; +using namespace dnn; + + +static +int diagnosticsErrorCallback(int /*status*/, const char* /*func_name*/, + const char* /*err_msg*/, const char* /*file_name*/, + int /*line*/, void* /*userdata*/) +{ + fflush(stdout); + fflush(stderr); + return 0; +} + +static std::string checkFileExists(const std::string& fileName) +{ + if (fileName.empty() || utils::fs::exists(fileName)) + return fileName; + + CV_Error(Error::StsObjectNotFound, "File " + fileName + " was not found! " + "Please, specify a full path to the file."); +} + +std::string diagnosticKeys = + "{ model m | | Path to the model .onnx file.
}" + "{ config c | | Path to the model configuration file. }" + "{ framework f | | [Optional] Name of the model framework. }"; + + + +int main( int argc, const char** argv ) +{ + CommandLineParser argParser(argc, argv, diagnosticKeys); + argParser.about("Use this tool to run the diagnostics of provided ONNX model" + "to obtain the information about its support (supported layers)."); + + if (argc == 1) + { + argParser.printMessage(); + return 0; + } + + std::string model = checkFileExists(argParser.get("model")); + std::string config = checkFileExists(argParser.get("config")); + std::string frameworkId = argParser.get("framework"); + + CV_Assert(!model.empty()); + + enableModelDiagnostics(true); + redirectError(diagnosticsErrorCallback, NULL); + + Net ocvNet = readNet(model, config, frameworkId); + + return 0; +} diff --git a/cmake/FindCUDNN.cmake b/cmake/FindCUDNN.cmake index 90d8b2ea78..195781b957 100644 --- a/cmake/FindCUDNN.cmake +++ b/cmake/FindCUDNN.cmake @@ -72,14 +72,14 @@ if(CUDNN_INCLUDE_DIR) endif() string(REGEX MATCH "define CUDNN_MAJOR ([0-9]+)" _ "${CUDNN_H_CONTENTS}") - set(CUDNN_MAJOR_VERSION ${CMAKE_MATCH_1} CACHE INTERNAL "") + set(CUDNN_VERSION_MAJOR ${CMAKE_MATCH_1} CACHE INTERNAL "") string(REGEX MATCH "define CUDNN_MINOR ([0-9]+)" _ "${CUDNN_H_CONTENTS}") - set(CUDNN_MINOR_VERSION ${CMAKE_MATCH_1} CACHE INTERNAL "") + set(CUDNN_VERSION_MINOR ${CMAKE_MATCH_1} CACHE INTERNAL "") string(REGEX MATCH "define CUDNN_PATCHLEVEL ([0-9]+)" _ "${CUDNN_H_CONTENTS}") - set(CUDNN_PATCH_VERSION ${CMAKE_MATCH_1} CACHE INTERNAL "") + set(CUDNN_VERSION_PATCH ${CMAKE_MATCH_1} CACHE INTERNAL "") set(CUDNN_VERSION - "${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCH_VERSION}" + "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}" CACHE STRING "cuDNN version" diff --git a/cmake/FindFlake8.cmake b/cmake/FindFlake8.cmake index b18225a011..8063571393 100644 --- a/cmake/FindFlake8.cmake +++ b/cmake/FindFlake8.cmake @@ -12,9 +12,11 @@ find_host_program(FLAKE8_EXECUTABLE flake8 PATHS /usr/bin) -if(FLAKE8_EXECUTABLE) - execute_process(COMMAND ${FLAKE8_EXECUTABLE} --version OUTPUT_VARIABLE FLAKE8_VERSION_RAW ERROR_QUIET) - if(FLAKE8_VERSION_RAW MATCHES "^([0-9\\.]+[0-9])") +if(FLAKE8_EXECUTABLE AND NOT DEFINED FLAKE8_VERSION) + execute_process(COMMAND ${FLAKE8_EXECUTABLE} --version RESULT_VARIABLE _result OUTPUT_VARIABLE FLAKE8_VERSION_RAW) + if(NOT _result EQUAL 0) + ocv_clear_vars(FLAKE8_EXECUTABLE FLAKE8_VERSION) + elseif(FLAKE8_VERSION_RAW MATCHES "^([0-9\\.]+[0-9])") set(FLAKE8_VERSION "${CMAKE_MATCH_1}") else() set(FLAKE8_VERSION "unknown") @@ -22,6 +24,9 @@ if(FLAKE8_EXECUTABLE) endif() include(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(Flake8 DEFAULT_MSG FLAKE8_EXECUTABLE) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(Flake8 + REQUIRED_VARS FLAKE8_EXECUTABLE + VERSION_VAR FLAKE8_VERSION +) mark_as_advanced(FLAKE8_EXECUTABLE FLAKE8_VERSION) diff --git a/cmake/FindPylint.cmake b/cmake/FindPylint.cmake index 7e26fe246e..ef4b4394ff 100644 --- a/cmake/FindPylint.cmake +++ b/cmake/FindPylint.cmake @@ -12,9 +12,11 @@ find_host_program(PYLINT_EXECUTABLE pylint PATHS /usr/bin) -if(PYLINT_EXECUTABLE) - execute_process(COMMAND ${PYLINT_EXECUTABLE} --version OUTPUT_VARIABLE PYLINT_VERSION_RAW ERROR_QUIET) - if(PYLINT_VERSION_RAW MATCHES "pylint([^,]*) ([0-9\\.]+[0-9])") +if(PYLINT_EXECUTABLE AND NOT DEFINED PYLINT_VERSION) + execute_process(COMMAND ${PYLINT_EXECUTABLE} --version RESULT_VARIABLE _result OUTPUT_VARIABLE PYLINT_VERSION_RAW) + if(NOT _result EQUAL 0) 
+ ocv_clear_vars(PYLINT_EXECUTABLE PYLINT_VERSION) + elseif(PYLINT_VERSION_RAW MATCHES "pylint([^,\n]*) ([0-9\\.]+[0-9])") set(PYLINT_VERSION "${CMAKE_MATCH_2}") else() set(PYLINT_VERSION "unknown") @@ -22,6 +24,9 @@ if(PYLINT_EXECUTABLE) endif() include(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(Pylint DEFAULT_MSG PYLINT_EXECUTABLE) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(Pylint + REQUIRED_VARS PYLINT_EXECUTABLE + VERSION_VAR PYLINT_VERSION +) mark_as_advanced(PYLINT_EXECUTABLE PYLINT_VERSION) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 929c5b5e51..40a058d74e 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -8,13 +8,27 @@ function(access_CMAKE_COMPILER_IS_CCACHE) endif() endfunction() variable_watch(CMAKE_COMPILER_IS_CCACHE access_CMAKE_COMPILER_IS_CCACHE) -if(ENABLE_CCACHE AND NOT OPENCV_COMPILER_IS_CCACHE AND NOT CMAKE_GENERATOR MATCHES "Xcode") +if(ENABLE_CCACHE AND NOT OPENCV_COMPILER_IS_CCACHE) # This works fine with Unix Makefiles and Ninja generators find_host_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) message(STATUS "Looking for ccache - found (${CCACHE_PROGRAM})") get_property(__OLD_RULE_LAUNCH_COMPILE GLOBAL PROPERTY RULE_LAUNCH_COMPILE) - if(__OLD_RULE_LAUNCH_COMPILE) + if(CMAKE_GENERATOR MATCHES "Xcode") + configure_file("${CMAKE_CURRENT_LIST_DIR}/templates/xcode-launch-c.in" "${CMAKE_BINARY_DIR}/xcode-launch-c") + configure_file("${CMAKE_CURRENT_LIST_DIR}/templates/xcode-launch-cxx.in" "${CMAKE_BINARY_DIR}/xcode-launch-cxx") + execute_process(COMMAND chmod a+rx + "${CMAKE_BINARY_DIR}/xcode-launch-c" + "${CMAKE_BINARY_DIR}/xcode-launch-cxx" + ) + # Xcode project attributes + set(CMAKE_XCODE_ATTRIBUTE_CC "${CMAKE_BINARY_DIR}/xcode-launch-c") + set(CMAKE_XCODE_ATTRIBUTE_CXX "${CMAKE_BINARY_DIR}/xcode-launch-cxx") + set(CMAKE_XCODE_ATTRIBUTE_LD "${CMAKE_BINARY_DIR}/xcode-launch-c") + set(CMAKE_XCODE_ATTRIBUTE_LDPLUSPLUS "${CMAKE_BINARY_DIR}/xcode-launch-cxx") + set(OPENCV_COMPILER_IS_CCACHE 1) + message(STATUS "ccache: enable support through Xcode project properties") + elseif(__OLD_RULE_LAUNCH_COMPILE) message(STATUS "Can't replace CMake compiler launcher") else() set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}") @@ -122,7 +136,6 @@ if(CV_GCC OR CV_CLANG) endif() add_extra_compiler_option(-Wsign-promo) add_extra_compiler_option(-Wuninitialized) - add_extra_compiler_option(-Winit-self) if(CV_GCC AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)) add_extra_compiler_option(-Wno-psabi) endif() @@ -291,7 +304,9 @@ if(MSVC) endif() endif() -include(cmake/OpenCVCompilerOptimizations.cmake) +if(PROJECT_NAME STREQUAL "OpenCV") + include("${OpenCV_SOURCE_DIR}/cmake/OpenCVCompilerOptimizations.cmake") +endif() if(COMMAND ocv_compiler_optimization_options) ocv_compiler_optimization_options() endif() @@ -398,11 +413,11 @@ if(APPLE AND NOT CMAKE_CROSSCOMPILING AND NOT DEFINED ENV{LDFLAGS} AND EXISTS "/ endif() if(ENABLE_BUILD_HARDENING) - include(${CMAKE_CURRENT_LIST_DIR}/OpenCVCompilerDefenses.cmake) + include("${CMAKE_CURRENT_LIST_DIR}/OpenCVCompilerDefenses.cmake") endif() if(MSVC) - include(cmake/OpenCVCRTLinkage.cmake) + include("${CMAKE_CURRENT_LIST_DIR}/OpenCVCRTLinkage.cmake") add_definitions(-D_VARIADIC_MAX=10) endif() diff --git a/cmake/OpenCVDetectInferenceEngine.cmake b/cmake/OpenCVDetectInferenceEngine.cmake index c838a40409..216c02c3cc 100644 --- a/cmake/OpenCVDetectInferenceEngine.cmake +++ 
b/cmake/OpenCVDetectInferenceEngine.cmake @@ -62,7 +62,13 @@ function(add_custom_ie_build _inc _lib _lib_rel _lib_dbg _msg) if(find_prefix STREQUAL "_empty_") # foreach doesn't iterate over empty elements set(find_prefix "") endif() - foreach(find_suffix ${CMAKE_FIND_LIBRARY_SUFFIXES}) + if(NOT DEFINED INFERENCE_ENGINE_FIND_LIBRARY_SUFFIXES) # allow custom override + set(INFERENCE_ENGINE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) + if(APPLE) + ocv_list_filterout(INFERENCE_ENGINE_FIND_LIBRARY_SUFFIXES "^.so$") # skip plugins (can't be linked) + endif() + endif() + foreach(find_suffix ${INFERENCE_ENGINE_FIND_LIBRARY_SUFFIXES}) ocv_ie_find_extra_libraries("${find_prefix}" "${find_suffix}") endforeach() if(NOT CMAKE_FIND_LIBRARY_SUFFIXES) @@ -129,9 +135,9 @@ endif() if(INF_ENGINE_TARGET) if(NOT INF_ENGINE_RELEASE) - message(WARNING "InferenceEngine version has not been set, 2021.1 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.") + message(WARNING "InferenceEngine version has not been set, 2021.3 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.") endif() - set(INF_ENGINE_RELEASE "2021010000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)") + set(INF_ENGINE_RELEASE "2021030000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)") set_target_properties(${INF_ENGINE_TARGET} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}" ) diff --git a/cmake/OpenCVDetectTBB.cmake b/cmake/OpenCVDetectTBB.cmake index 38137f44f0..fc564e981a 100644 --- a/cmake/OpenCVDetectTBB.cmake +++ b/cmake/OpenCVDetectTBB.cmake @@ -1,4 +1,4 @@ -# Search TBB library (4.1 - 4.4, 2017) +# Search TBB library: 4.1 - 4.4, 2017-2020, 2021+ (oneTBB) # # Own TBB (3rdparty/tbb): # - set cmake option BUILD_TBB to ON @@ -19,7 +19,7 @@ # - "tbb" target exists and added to OPENCV_LINKER_LIBS function(ocv_tbb_cmake_guess _found) - find_package(TBB QUIET COMPONENTS tbb PATHS "$ENV{TBBROOT}/cmake") + find_package(TBB QUIET COMPONENTS tbb PATHS "$ENV{TBBROOT}/cmake" "$ENV{TBBROOT}/lib/cmake/tbb") if(TBB_FOUND) if(NOT TARGET TBB::tbb) message(WARNING "No TBB::tbb target found!") @@ -28,11 +28,11 @@ function(ocv_tbb_cmake_guess _found) get_target_property(_lib TBB::tbb IMPORTED_LOCATION_RELEASE) message(STATUS "Found TBB (cmake): ${_lib}") get_target_property(_inc TBB::tbb INTERFACE_INCLUDE_DIRECTORIES) - ocv_tbb_read_version("${_inc}") add_library(tbb INTERFACE IMPORTED) set_target_properties(tbb PROPERTIES INTERFACE_LINK_LIBRARIES TBB::tbb ) + ocv_tbb_read_version("${_inc}" tbb) set(${_found} TRUE PARENT_SCOPE) endif() endfunction() @@ -66,7 +66,6 @@ function(ocv_tbb_env_guess _found) find_library(TBB_ENV_LIB_DEBUG NAMES "tbb_debug") if (TBB_ENV_INCLUDE AND (TBB_ENV_LIB OR TBB_ENV_LIB_DEBUG)) ocv_tbb_env_verify() - ocv_tbb_read_version("${TBB_ENV_INCLUDE}") add_library(tbb UNKNOWN IMPORTED) set_target_properties(tbb PROPERTIES IMPORTED_LOCATION "${TBB_ENV_LIB}" @@ -82,12 +81,23 @@ function(ocv_tbb_env_guess _found) get_filename_component(_dir "${TBB_ENV_LIB}" DIRECTORY) set_target_properties(tbb PROPERTIES INTERFACE_LINK_LIBRARIES "-L${_dir}") endif() + ocv_tbb_read_version("${TBB_ENV_INCLUDE}" tbb) + if(NOT (TBB_INTERFACE_VERSION LESS 12000)) # >= 12000, oneTBB 2021+ + # avoid "defaultlib" requirement of tbb12.lib (we are using absolute path to 'tbb.lib' only) + # 
https://github.com/oneapi-src/oneTBB/blame/2dba2072869a189b9fdab3ffa431d3ea49059a19/include/oneapi/tbb/detail/_config.h#L334 + if(NOT (CMAKE_VERSION VERSION_LESS "3.16.0")) # https://gitlab.kitware.com/cmake/cmake/-/issues/19434 + target_compile_definitions(tbb INTERFACE "__TBB_NO_IMPLICIT_LINKAGE=1") + else() + set_target_properties(tbb PROPERTIES INTERFACE_COMPILE_DEFINITIONS "__TBB_NO_IMPLICIT_LINKAGE=1") + endif() + endif() message(STATUS "Found TBB (env): ${TBB_ENV_LIB}") set(${_found} TRUE PARENT_SCOPE) endif() endfunction() -function(ocv_tbb_read_version _path) +function(ocv_tbb_read_version _path _tgt) + find_file(TBB_VER_FILE oneapi/tbb/version.h "${_path}" NO_DEFAULT_PATH CMAKE_FIND_ROOT_PATH_BOTH) find_file(TBB_VER_FILE tbb/tbb_stddef.h "${_path}" NO_DEFAULT_PATH CMAKE_FIND_ROOT_PATH_BOTH) ocv_parse_header("${TBB_VER_FILE}" TBB_VERSION_LINES TBB_VERSION_MAJOR TBB_VERSION_MINOR TBB_INTERFACE_VERSION CACHE) endfunction() diff --git a/cmake/OpenCVFindIPP.cmake b/cmake/OpenCVFindIPP.cmake index 9bc215f415..6bcd81d8b4 100644 --- a/cmake/OpenCVFindIPP.cmake +++ b/cmake/OpenCVFindIPP.cmake @@ -143,10 +143,25 @@ macro(ipp_detect_version) list(APPEND IPP_LIBRARIES ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}) else () add_library(ipp${name} STATIC IMPORTED) + set(_filename "${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}") set_target_properties(ipp${name} PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "" - IMPORTED_LOCATION ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX} + IMPORTED_LOCATION ${IPP_LIBRARY_DIR}/${_filename} ) + if("${name}" STREQUAL "core") # https://github.com/opencv/opencv/pull/19681 + if(OPENCV_FORCE_IPP_EXCLUDE_LIBS OR OPENCV_FORCE_IPP_EXCLUDE_LIBS_CORE + OR (UNIX AND NOT ANDROID AND NOT APPLE + AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang|Intel" + ) + AND NOT OPENCV_SKIP_IPP_EXCLUDE_LIBS_CORE + ) + if(CMAKE_VERSION VERSION_LESS "3.13.0") + set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--exclude-libs,${_filename} ${CMAKE_SHARED_LINKER_FLAGS}") + else() + target_link_options(ipp${name} INTERFACE "LINKER:--exclude-libs,${_filename}") + endif() + endif() + endif() list(APPEND IPP_LIBRARIES ipp${name}) if (NOT BUILD_SHARED_LIBS AND (HAVE_IPP_ICV OR ";${OPENCV_INSTALL_EXTERNAL_DEPENDENCIES};" MATCHES ";ipp;")) # CMake doesn't support "install(TARGETS ${IPP_PREFIX}${name} " command with imported targets diff --git a/cmake/OpenCVFindLAPACK.cmake b/cmake/OpenCVFindLAPACK.cmake index 3fa23ef83f..4ff2dee4d8 100644 --- a/cmake/OpenCVFindLAPACK.cmake +++ b/cmake/OpenCVFindLAPACK.cmake @@ -51,17 +51,53 @@ macro(ocv_lapack_make_hdr _cblas_hdr _lapacke_hdr) endmacro() macro(ocv_lapack_run_check) + if(CMAKE_GENERATOR MATCHES "Visual Studio" # MSBuild + AND LAPACK_IMPL STREQUAL "MKL" + AND ";${LAPACK_LIBRARIES};" MATCHES ";tbb;" AND TARGET tbb + AND DEFINED TBB_INTERFACE_VERSION AND NOT (TBB_INTERFACE_VERSION LESS 12000) # oneTBB/oneAPI workaround + ) + # workaround DEFAULTLIB:tbb12.lib issue + get_target_property(_tbb_lib tbb IMPORTED_LOCATION) + if(NOT _tbb_lib) + get_target_property(_tbb_lib tbb IMPORTED_LOCATION_RELEASE) + endif() + if(_tbb_lib AND NOT OPENCV_SKIP_WORKAROUND_MKL_LINK_DIRECTORIES_TBB) + # MSBuild drops content of 'LIB' environment variable, + # so pass TBB library directory through `link_directories()` + get_filename_component(_tbb_lib_dir "${_tbb_lib}" DIRECTORY) + message(STATUS "MKL: adding '${_tbb_lib_dir}' to link directories (workaround DEFAULTLIB issue)") + 
link_directories("${_tbb_lib_dir}") + elseif(NOT OPENCV_SKIP_WORKAROUND_MKL_DEFAULTLIB) + # We may have tbb.lib for 'tbb' target, but not 'tbb12.lib' + ocv_update(OPENCV_MKL_IGNORE_DEFAULTLIB_TBB "tbb12.lib") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /NODEFAULTLIB:${OPENCV_MKL_IGNORE_DEFAULTLIB_TBB}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:${OPENCV_MKL_IGNORE_DEFAULTLIB_TBB}") + endif() + endif() + + # TODO add cache for try_compile() inputs/results + + get_property(__link_directories DIRECTORY PROPERTY LINK_DIRECTORIES) + if(LAPACK_LINK_LIBRARIES) + list(APPEND __link_directories ${LAPACK_LINK_LIBRARIES}) + endif() + try_compile(__VALID_LAPACK - "${OpenCV_BINARY_DIR}" - "${OpenCV_SOURCE_DIR}/cmake/checks/lapack_check.cpp" - CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${LAPACK_INCLUDE_DIR}\;${CMAKE_BINARY_DIR}" - "-DLINK_DIRECTORIES:STRING=${LAPACK_LINK_LIBRARIES}" - "-DLINK_LIBRARIES:STRING=${LAPACK_LIBRARIES}" - OUTPUT_VARIABLE TRY_OUT + "${OpenCV_BINARY_DIR}" + "${OpenCV_SOURCE_DIR}/cmake/checks/lapack_check.cpp" + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${LAPACK_INCLUDE_DIR}\;${CMAKE_BINARY_DIR}" + "-DLINK_DIRECTORIES:STRING=${__link_directories}" + LINK_LIBRARIES ${LAPACK_LIBRARIES} + OUTPUT_VARIABLE TRY_OUT ) if(NOT __VALID_LAPACK) - #message(FATAL_ERROR "LAPACK: check build log:\n${TRY_OUT}") - message(STATUS "${LAPACK_IMPL}: Can't build LAPACK check code. This LAPACK version is not supported.") + file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log + "\nLAPACK(${LAPACK_IMPL}) check FAILED:\n" + " LAPACK_INCLUDE_DIR: '${LAPACK_INCLUDE_DIR}'\n" + " LAPACK_LIBRARIES: '${LAPACK_LIBRARIES}'\n" + " LAPACK_LINK_LIBRARIES: '${__link_directories}'\n" + " Output:\n${TRY_OUT}\n\n") + message(STATUS "LAPACK(${LAPACK_IMPL}): Can't build LAPACK check code. 
This LAPACK version is not supported.") unset(LAPACK_LIBRARIES) else() message(STATUS "${LAPACK_IMPL}: Support is enabled.") diff --git a/cmake/OpenCVFindLibsGrfmt.cmake b/cmake/OpenCVFindLibsGrfmt.cmake index 28aa47ba9c..23a6ca6959 100644 --- a/cmake/OpenCVFindLibsGrfmt.cmake +++ b/cmake/OpenCVFindLibsGrfmt.cmake @@ -6,9 +6,18 @@ if(BUILD_ZLIB) ocv_clear_vars(ZLIB_FOUND) else() + ocv_clear_internal_cache_vars(ZLIB_LIBRARY ZLIB_INCLUDE_DIR) + if(ANDROID) + set(_zlib_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) + set(CMAKE_FIND_LIBRARY_SUFFIXES .so) + endif() find_package(ZLIB "${MIN_VER_ZLIB}") + if(ANDROID) + set(CMAKE_FIND_LIBRARY_SUFFIXES ${_zlib_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) + unset(_zlib_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES) + endif() if(ZLIB_FOUND AND ANDROID) - if(ZLIB_LIBRARIES MATCHES "/usr/(lib|lib32|lib64)/libz.so$") + if(ZLIB_LIBRARIES MATCHES "/usr/lib.*/libz.so$") set(ZLIB_LIBRARIES z) endif() endif() @@ -31,11 +40,12 @@ if(WITH_JPEG) if(BUILD_JPEG) ocv_clear_vars(JPEG_FOUND) else() + ocv_clear_internal_cache_vars(JPEG_LIBRARY JPEG_INCLUDE_DIR) include(FindJPEG) endif() if(NOT JPEG_FOUND) - ocv_clear_vars(JPEG_LIBRARY JPEG_LIBRARIES JPEG_INCLUDE_DIR) + ocv_clear_vars(JPEG_LIBRARY JPEG_INCLUDE_DIR) if(NOT BUILD_JPEG_TURBO_DISABLE) set(JPEG_LIBRARY libjpeg-turbo CACHE INTERNAL "") @@ -76,6 +86,7 @@ if(WITH_TIFF) if(BUILD_TIFF) ocv_clear_vars(TIFF_FOUND) else() + ocv_clear_internal_cache_vars(TIFF_LIBRARY TIFF_INCLUDE_DIR) include(FindTIFF) if(TIFF_FOUND) ocv_parse_header("${TIFF_INCLUDE_DIR}/tiff.h" TIFF_VERSION_LINES TIFF_VERSION_CLASSIC TIFF_VERSION_BIG TIFF_VERSION TIFF_BIGTIFF_VERSION) @@ -119,6 +130,7 @@ if(WITH_WEBP) if(BUILD_WEBP) ocv_clear_vars(WEBP_FOUND WEBP_LIBRARY WEBP_LIBRARIES WEBP_INCLUDE_DIR) else() + ocv_clear_internal_cache_vars(WEBP_LIBRARY WEBP_INCLUDE_DIR) include(cmake/OpenCVFindWebP.cmake) if(WEBP_FOUND) set(HAVE_WEBP 1) @@ -212,6 +224,7 @@ if(WITH_PNG) if(BUILD_PNG) ocv_clear_vars(PNG_FOUND) else() + ocv_clear_internal_cache_vars(PNG_LIBRARY PNG_INCLUDE_DIR) include(FindPNG) if(PNG_FOUND) include(CheckIncludeFile) @@ -243,6 +256,7 @@ endif() if(WITH_OPENEXR) ocv_clear_vars(HAVE_OPENEXR) if(NOT BUILD_OPENEXR) + ocv_clear_internal_cache_vars(OPENEXR_INCLUDE_PATHS OPENEXR_LIBRARIES OPENEXR_ILMIMF_LIBRARY OPENEXR_VERSION) include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindOpenEXR.cmake") endif() diff --git a/cmake/OpenCVFindLibsPerf.cmake b/cmake/OpenCVFindLibsPerf.cmake index 3753084d28..a191afde58 100644 --- a/cmake/OpenCVFindLibsPerf.cmake +++ b/cmake/OpenCVFindLibsPerf.cmake @@ -29,7 +29,7 @@ if(WITH_IPP) if(OPENCV_FORCE_IPP_EXCLUDE_LIBS OR (HAVE_IPP_ICV AND UNIX AND NOT ANDROID AND NOT APPLE - AND (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang|Intel" ) AND NOT OPENCV_SKIP_IPP_EXCLUDE_LIBS ) diff --git a/cmake/OpenCVFindMKL.cmake b/cmake/OpenCVFindMKL.cmake index 141481ed42..00fd637ca1 100644 --- a/cmake/OpenCVFindMKL.cmake +++ b/cmake/OpenCVFindMKL.cmake @@ -3,7 +3,14 @@ # installation/package # # Parameters: -# MKL_WITH_TBB +# MKL_ROOT_DIR / ENV{MKLROOT} +# MKL_INCLUDE_DIR +# MKL_LIBRARIES +# MKL_USE_SINGLE_DYNAMIC_LIBRARY - use single dynamic library mkl_rt.lib / libmkl_rt.so +# MKL_WITH_TBB / MKL_WITH_OPENMP +# +# Extra: +# MKL_LIB_FIND_PATHS # # On return this will define: # @@ -13,12 +20,6 @@ # MKL_LIBRARIES - MKL libraries that are used by OpenCV # -macro (mkl_find_lib VAR NAME DIRS) - find_path(${VAR} ${NAME} ${DIRS} NO_DEFAULT_PATH) - set(${VAR} 
${${VAR}}/${NAME}) - unset(${VAR} CACHE) -endmacro() - macro(mkl_fail) set(HAVE_MKL OFF) set(MKL_ROOT_DIR "${MKL_ROOT_DIR}" CACHE PATH "Path to MKL directory") @@ -39,43 +40,50 @@ macro(get_mkl_version VERSION_FILE) set(MKL_VERSION_STR "${MKL_VERSION_MAJOR}.${MKL_VERSION_MINOR}.${MKL_VERSION_UPDATE}" CACHE STRING "MKL version" FORCE) endmacro() +OCV_OPTION(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use MKL Single Dynamic Library through mkl_rt.lib / libmkl_rt.so" OFF) +OCV_OPTION(MKL_WITH_TBB "Use MKL with TBB multithreading" OFF)#ON IF WITH_TBB) +OCV_OPTION(MKL_WITH_OPENMP "Use MKL with OpenMP multithreading" OFF)#ON IF WITH_OPENMP) -if(NOT DEFINED MKL_USE_MULTITHREAD) - OCV_OPTION(MKL_WITH_TBB "Use MKL with TBB multithreading" OFF)#ON IF WITH_TBB) - OCV_OPTION(MKL_WITH_OPENMP "Use MKL with OpenMP multithreading" OFF)#ON IF WITH_OPENMP) +if(NOT MKL_ROOT_DIR AND DEFINED MKL_INCLUDE_DIR AND EXISTS "${MKL_INCLUDE_DIR}/mkl.h") + file(TO_CMAKE_PATH "${MKL_INCLUDE_DIR}" MKL_INCLUDE_DIR) + get_filename_component(MKL_ROOT_DIR "${MKL_INCLUDE_DIR}/.." ABSOLUTE) +endif() +if(NOT MKL_ROOT_DIR) + file(TO_CMAKE_PATH "${MKL_ROOT_DIR}" mkl_root_paths) + if(DEFINED ENV{MKLROOT}) + file(TO_CMAKE_PATH "$ENV{MKLROOT}" path) + list(APPEND mkl_root_paths "${path}") + endif() + + if(WITH_MKL AND NOT mkl_root_paths) + if(WIN32) + set(ProgramFilesx86 "ProgramFiles(x86)") + file(TO_CMAKE_PATH "$ENV{${ProgramFilesx86}}" path) + list(APPEND mkl_root_paths ${path}/IntelSWTools/compilers_and_libraries/windows/mkl) + endif() + if(UNIX) + list(APPEND mkl_root_paths "/opt/intel/mkl") + endif() + endif() + + find_path(MKL_ROOT_DIR include/mkl.h PATHS ${mkl_root_paths}) endif() -#check current MKL_ROOT_DIR if(NOT MKL_ROOT_DIR OR NOT EXISTS "${MKL_ROOT_DIR}/include/mkl.h") - set(mkl_root_paths "${MKL_ROOT_DIR}") - if(DEFINED ENV{MKLROOT}) - list(APPEND mkl_root_paths "$ENV{MKLROOT}") - endif() - - if(WITH_MKL AND NOT mkl_root_paths) - if(WIN32) - set(ProgramFilesx86 "ProgramFiles(x86)") - list(APPEND mkl_root_paths $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows/mkl) - endif() - if(UNIX) - list(APPEND mkl_root_paths "/opt/intel/mkl") - endif() - endif() - - find_path(MKL_ROOT_DIR include/mkl.h PATHS ${mkl_root_paths}) + mkl_fail() endif() -set(MKL_INCLUDE_DIRS "${MKL_ROOT_DIR}/include" CACHE PATH "Path to MKL include directory") +set(MKL_INCLUDE_DIR "${MKL_ROOT_DIR}/include" CACHE PATH "Path to MKL include directory") if(NOT MKL_ROOT_DIR OR NOT EXISTS "${MKL_ROOT_DIR}" - OR NOT EXISTS "${MKL_INCLUDE_DIRS}" - OR NOT EXISTS "${MKL_INCLUDE_DIRS}/mkl_version.h" + OR NOT EXISTS "${MKL_INCLUDE_DIR}" + OR NOT EXISTS "${MKL_INCLUDE_DIR}/mkl_version.h" ) - mkl_fail() + mkl_fail() endif() -get_mkl_version(${MKL_INCLUDE_DIRS}/mkl_version.h) +get_mkl_version(${MKL_INCLUDE_DIR}/mkl_version.h) #determine arch if(CMAKE_CXX_SIZEOF_DATA_PTR EQUAL 8) @@ -95,52 +103,73 @@ else() set(MKL_ARCH_SUFFIX "c") endif() -if(MKL_VERSION_STR VERSION_GREATER "11.3.0" OR MKL_VERSION_STR VERSION_EQUAL "11.3.0") - set(mkl_lib_find_paths - ${MKL_ROOT_DIR}/lib) - foreach(MKL_ARCH ${MKL_ARCH_LIST}) - list(APPEND mkl_lib_find_paths - ${MKL_ROOT_DIR}/lib/${MKL_ARCH} - ${MKL_ROOT_DIR}/../tbb/lib/${MKL_ARCH} - ${MKL_ROOT_DIR}/${MKL_ARCH}) - endforeach() +set(mkl_lib_find_paths ${MKL_LIB_FIND_PATHS} ${MKL_ROOT_DIR}/lib) +foreach(MKL_ARCH ${MKL_ARCH_LIST}) + list(APPEND mkl_lib_find_paths + ${MKL_ROOT_DIR}/lib/${MKL_ARCH} + ${MKL_ROOT_DIR}/${MKL_ARCH} + ) +endforeach() - set(mkl_lib_list "mkl_intel_${MKL_ARCH_SUFFIX}") +if(DEFINED OPENCV_MKL_LIBRARIES) +
# custom list, user specified + set(mkl_lib_list ${OPENCV_MKL_LIBRARIES}) - if(MKL_WITH_TBB) - list(APPEND mkl_lib_list mkl_tbb_thread tbb) - elseif(MKL_WITH_OPENMP) - if(MSVC) - list(APPEND mkl_lib_list mkl_intel_thread libiomp5md) - else() - list(APPEND mkl_lib_list mkl_gnu_thread) - endif() +elseif(MKL_USE_SINGLE_DYNAMIC_LIBRARY AND NOT (MKL_VERSION_STR VERSION_LESS "10.3.0")) + + # https://software.intel.com/content/www/us/en/develop/articles/a-new-linking-model-single-dynamic-library-mkl_rt-since-intel-mkl-103.html + set(mkl_lib_list "mkl_rt") + +elseif(NOT (MKL_VERSION_STR VERSION_LESS "11.3.0")) + + set(mkl_lib_list "mkl_intel_${MKL_ARCH_SUFFIX}") + + if(MKL_WITH_TBB) + list(APPEND mkl_lib_list mkl_tbb_thread) + elseif(MKL_WITH_OPENMP) + if(MSVC) + list(APPEND mkl_lib_list mkl_intel_thread libiomp5md) else() - list(APPEND mkl_lib_list mkl_sequential) + list(APPEND mkl_lib_list mkl_gnu_thread) endif() + else() + list(APPEND mkl_lib_list mkl_sequential) + endif() - list(APPEND mkl_lib_list mkl_core) + list(APPEND mkl_lib_list mkl_core) else() - message(STATUS "MKL version ${MKL_VERSION_STR} is not supported") - mkl_fail() + message(STATUS "MKL version ${MKL_VERSION_STR} is not supported") + mkl_fail() endif() -set(MKL_LIBRARIES "") -foreach(lib ${mkl_lib_list}) - find_library(${lib} NAMES ${lib} ${lib}_dll HINTS ${mkl_lib_find_paths}) - mark_as_advanced(${lib}) - if(NOT ${lib}) - mkl_fail() +if(NOT MKL_LIBRARIES) + set(MKL_LIBRARIES "") + foreach(lib ${mkl_lib_list}) + set(lib_var_name MKL_LIBRARY_${lib}) + find_library(${lib_var_name} NAMES ${lib} ${lib}_dll HINTS ${mkl_lib_find_paths}) + mark_as_advanced(${lib_var_name}) + if(NOT ${lib_var_name}) + mkl_fail() endif() - list(APPEND MKL_LIBRARIES ${${lib}}) -endforeach() + list(APPEND MKL_LIBRARIES ${${lib_var_name}}) + endforeach() + list(APPEND MKL_LIBRARIES ${OPENCV_EXTRA_MKL_LIBRARIES}) +endif() + +if(MKL_WITH_TBB) + if(BUILD_TBB) + message(STATUS "MKL: reusing builtin TBB binaries is not supported. 
Consider disabling MKL_WITH_TBB flag to prevent build/runtime errors") + else() + list(APPEND MKL_LIBRARIES tbb) # tbb target is expected + endif() +endif() message(STATUS "Found MKL ${MKL_VERSION_STR} at: ${MKL_ROOT_DIR}") set(HAVE_MKL ON) set(MKL_ROOT_DIR "${MKL_ROOT_DIR}" CACHE PATH "Path to MKL directory") -set(MKL_INCLUDE_DIRS "${MKL_INCLUDE_DIRS}" CACHE PATH "Path to MKL include directory") -set(MKL_LIBRARIES "${MKL_LIBRARIES}" CACHE STRING "MKL libraries") -if(UNIX AND NOT MKL_LIBRARIES_DONT_HACK) +set(MKL_INCLUDE_DIRS "${MKL_INCLUDE_DIR}") +set(MKL_LIBRARIES "${MKL_LIBRARIES}") +if(UNIX AND NOT MKL_USE_SINGLE_DYNAMIC_LIBRARY AND NOT MKL_LIBRARIES_DONT_HACK) #it's ugly but helps to avoid cyclic lib problem set(MKL_LIBRARIES ${MKL_LIBRARIES} ${MKL_LIBRARIES} ${MKL_LIBRARIES} "-lpthread" "-lm" "-ldl") endif() diff --git a/cmake/OpenCVFindVA.cmake b/cmake/OpenCVFindVA.cmake index 9d0ceec2c5..08d034f690 100644 --- a/cmake/OpenCVFindVA.cmake +++ b/cmake/OpenCVFindVA.cmake @@ -2,21 +2,20 @@ # HAVE_VA - libva is available # HAVE_VA_INTEL - OpenCL/libva Intel interoperability extension is available -if(UNIX AND NOT ANDROID) - find_path( +find_path( VA_INCLUDE_DIR NAMES va/va.h - PATHS "/usr/include" + PATHS ${VA_ROOT_DIR} PATH_SUFFIXES include - DOC "Path to libva headers") -endif() + DOC "Path to libva headers" +) if(VA_INCLUDE_DIR) set(HAVE_VA TRUE) - if(NOT DEFINED VA_LIBRARIES) + if(NOT DEFINED VA_LIBRARIES AND NOT OPENCV_LIBVA_LINK) set(VA_LIBRARIES "va" "va-drm") endif() else() set(HAVE_VA FALSE) - message(WARNING "libva installation is not found.") + message(STATUS "libva: missing va.h header (VA_INCLUDE_DIR)") endif() diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index bd14aa2378..224953a1f3 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -98,15 +98,6 @@ macro(ocv_add_dependencies full_modname) endforeach() unset(__depsvar) - # hack for python - set(__python_idx) - list(FIND OPENCV_MODULE_${full_modname}_WRAPPERS "python" __python_idx) - if (NOT __python_idx EQUAL -1) - list(REMOVE_ITEM OPENCV_MODULE_${full_modname}_WRAPPERS "python") - list(APPEND OPENCV_MODULE_${full_modname}_WRAPPERS "python_bindings_generator" "python2" "python3") - endif() - unset(__python_idx) - ocv_list_unique(OPENCV_MODULE_${full_modname}_REQ_DEPS) ocv_list_unique(OPENCV_MODULE_${full_modname}_OPT_DEPS) ocv_list_unique(OPENCV_MODULE_${full_modname}_PRIVATE_REQ_DEPS) @@ -210,11 +201,6 @@ macro(ocv_add_module _name) set(OPENCV_MODULES_DISABLED_USER ${OPENCV_MODULES_DISABLED_USER} "${the_module}" CACHE INTERNAL "List of OpenCV modules explicitly disabled by user") endif() - # add reverse wrapper dependencies - foreach (wrapper ${OPENCV_MODULE_${the_module}_WRAPPERS}) - ocv_add_dependencies(opencv_${wrapper} OPTIONAL ${the_module}) - endforeach() - # stop processing of current file ocv_cmake_hook(POST_ADD_MODULE) ocv_cmake_hook(POST_ADD_MODULE_${the_module}) @@ -501,6 +487,21 @@ function(__ocv_resolve_dependencies) endforeach() endif() + # add reverse wrapper dependencies (BINDINGS) + foreach(the_module ${OPENCV_MODULES_BUILD}) + foreach (wrapper ${OPENCV_MODULE_${the_module}_WRAPPERS}) + if(wrapper STREQUAL "python") # hack for python (BINDINGS) + ocv_add_dependencies(opencv_python2 OPTIONAL ${the_module}) + ocv_add_dependencies(opencv_python3 OPTIONAL ${the_module}) + else() + ocv_add_dependencies(opencv_${wrapper} OPTIONAL ${the_module}) + endif() + if(DEFINED OPENCV_MODULE_opencv_${wrapper}_bindings_generator_CLASS) +
ocv_add_dependencies(opencv_${wrapper}_bindings_generator OPTIONAL ${the_module}) + endif() + endforeach() + endforeach() + # disable MODULES with unresolved dependencies set(has_changes ON) while(has_changes) @@ -878,7 +879,9 @@ endmacro() macro(_ocv_create_module) ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module}) - set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}") + set(__module_headers ${OPENCV_MODULE_${the_module}_HEADERS}) + list(SORT __module_headers) # fix headers order, useful for bindings + set(OPENCV_MODULE_${the_module}_HEADERS ${__module_headers} CACHE INTERNAL "List of header files for ${the_module}") set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}") # The condition we ought to be testing here is whether ocv_add_precompiled_headers will diff --git a/cmake/OpenCVPluginStandalone.cmake b/cmake/OpenCVPluginStandalone.cmake new file mode 100644 index 0000000000..15b7a8085e --- /dev/null +++ b/cmake/OpenCVPluginStandalone.cmake @@ -0,0 +1,131 @@ +# Standalone OpenCV plugins build scripts +# +# Useful OpenCV common build variables: +# - CMAKE_BUILD_TYPE=Release/Debug +# - BUILD_WITH_DEBUG_INFO=ON +# - ENABLE_BUILD_HARDENING=ON +# +# Plugin configuration variables: +# - OPENCV_PLUGIN_DEPS - set of extra dependencies (modules), used for include dirs, target_link_libraries +# - OPENCV_PLUGIN_SUFFIX +# - OPENCV_PLUGIN_NAME +# - OPENCV_PLUGIN_OUTPUT_NAME_FULL (overrides both OPENCV_PLUGIN_NAME / OPENCV_PLUGIN_SUFFIX) +# +#============================================= + +if(NOT OpenCV_SOURCE_DIR) + message(FATAL_ERROR "OpenCV_SOURCE_DIR must be set to build the plugin!") +endif() + +if(NOT DEFINED CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") +endif() +message(STATUS "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") + +set(BUILD_SHARED_LIBS ON CACHE BOOL "") +if(NOT BUILD_SHARED_LIBS) + message(FATAL_ERROR "Static plugin build does not make sense") +endif() + +# re-use OpenCV build scripts +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVUtils.cmake") +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectCXXCompiler.cmake") +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVCompilerOptions.cmake") + +function(ocv_create_plugin module default_name dependency_target dependency_target_desc) + + set(OPENCV_PLUGIN_NAME ${default_name} CACHE STRING "") + set(OPENCV_PLUGIN_DESTINATION "" CACHE PATH "") + project(${OPENCV_PLUGIN_NAME} LANGUAGES CXX) + + if(NOT TARGET ${dependency_target}) + message(FATAL_ERROR "${dependency_target_desc} was not found! 
(missing target ${dependency_target})") + endif() + + set(modules_ROOT "${OpenCV_SOURCE_DIR}/modules") + set(module_ROOT "${modules_ROOT}/${module}") + + foreach(src ${ARGN}) + list(APPEND sources "${module_ROOT}/${src}") + endforeach() + + add_library(${OPENCV_PLUGIN_NAME} MODULE + "${sources}" + ${OPENCV_PLUGIN_EXTRA_SRC_FILES} + ) + + if(OPENCV_PLUGIN_DEPS) + foreach(d ${OPENCV_PLUGIN_DEPS}) + list(APPEND OPENCV_PLUGIN_EXTRA_INCLUDES "${modules_ROOT}/${d}/include") + endforeach() + endif() + + target_include_directories(${OPENCV_PLUGIN_NAME} PRIVATE + "${CMAKE_CURRENT_BINARY_DIR}" + "${module_ROOT}/src" + "${module_ROOT}/include" + ${OPENCV_PLUGIN_EXTRA_INCLUDES} + ) + target_compile_definitions(${OPENCV_PLUGIN_NAME} PRIVATE "BUILD_PLUGIN=1") + + target_link_libraries(${OPENCV_PLUGIN_NAME} PRIVATE ${dependency_target}) + set_target_properties(${OPENCV_PLUGIN_NAME} PROPERTIES + CXX_STANDARD 11 + CXX_VISIBILITY_PRESET hidden + ) + + if(DEFINED OPENCV_PLUGIN_MODULE_PREFIX) + set_target_properties(${OPENCV_PLUGIN_NAME} PROPERTIES PREFIX "${OPENCV_PLUGIN_MODULE_PREFIX}") + endif() + + if(APPLE) + set_target_properties(${OPENCV_PLUGIN_NAME} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + elseif(WIN32) + # Hack for Windows only, Linux/MacOS uses global symbol table (without exact .so binding) + find_package(OpenCV REQUIRED ${module} ${OPENCV_PLUGIN_DEPS}) + target_link_libraries(${OPENCV_PLUGIN_NAME} PRIVATE ${OpenCV_LIBRARIES}) + endif() + + if(NOT OpenCV_FOUND) # build against sources (Linux) + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/opencv2/opencv_modules.hpp" "#pragma once") + endif() + + if(WIN32) + ocv_update(OPENCV_DEBUG_POSTFIX d) + endif() + set_target_properties(${OPENCV_PLUGIN_NAME} PROPERTIES DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}") + + if(DEFINED OPENCV_PLUGIN_SUFFIX) + # custom value + else() + if(WIN32) + ocv_update(OPENCV_PLUGIN_VERSION "${OpenCV_VERSION_MAJOR}${OpenCV_VERSION_MINOR}${OpenCV_VERSION_PATCH}") + if(CMAKE_CXX_SIZEOF_DATA_PTR EQUAL 8) + ocv_update(OPENCV_PLUGIN_ARCH "_64") + else() + ocv_update(OPENCV_PLUGIN_ARCH "") + endif() + else() + # empty + endif() + ocv_update(OPENCV_PLUGIN_SUFFIX "${OPENCV_PLUGIN_VERSION}${OPENCV_PLUGIN_ARCH}") + endif() + + if(OPENCV_PLUGIN_DESTINATION) + set_target_properties(${OPENCV_PLUGIN_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${OPENCV_PLUGIN_DESTINATION}") + message(STATUS "Output destination: ${OPENCV_PLUGIN_DESTINATION}") + endif() + + if(OPENCV_PLUGIN_OUTPUT_NAME_FULL) + set_target_properties(${OPENCV_PLUGIN_NAME} PROPERTIES OUTPUT_NAME "${OPENCV_PLUGIN_OUTPUT_NAME_FULL}") + elseif(OPENCV_PLUGIN_OUTPUT_NAME) + set_target_properties(${OPENCV_PLUGIN_NAME} PROPERTIES OUTPUT_NAME "${OPENCV_PLUGIN_OUTPUT_NAME}${OPENCV_PLUGIN_SUFFIX}") + else() + set_target_properties(${OPENCV_PLUGIN_NAME} PROPERTIES OUTPUT_NAME "${OPENCV_PLUGIN_NAME}${OPENCV_PLUGIN_SUFFIX}") + endif() + + install(TARGETS ${OPENCV_PLUGIN_NAME} LIBRARY DESTINATION . COMPONENT plugins) + + message(STATUS "Library name: ${OPENCV_PLUGIN_NAME}") + +endfunction() diff --git a/cmake/OpenCVPylint.cmake b/cmake/OpenCVPylint.cmake index 50da730946..928926d340 100644 --- a/cmake/OpenCVPylint.cmake +++ b/cmake/OpenCVPylint.cmake @@ -122,7 +122,6 @@ function(ocv_pylint_finalize) list(LENGTH PYLINT_TARGET_ID __total) set(PYLINT_TOTAL_TARGETS "${__total}" CACHE INTERNAL "") - message(STATUS "Pylint: registered ${__total} targets. Build 'check_pylint' target to run checks (\"cmake --build . 
--target check_pylint\" or \"make check_pylint\")") configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/pylint.cmake.in" "${CMAKE_BINARY_DIR}/pylint.cmake" @ONLY) add_custom_target(check_pylint diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake index 2ad380236c..0951e06581 100644 --- a/cmake/OpenCVUtils.cmake +++ b/cmake/OpenCVUtils.cmake @@ -8,7 +8,20 @@ include(CMakeParseArguments) function(ocv_cmake_dump_vars) set(OPENCV_SUPPRESS_DEPRECATIONS 1) # suppress deprecation warnings from variable_watch() guards get_cmake_property(__variableNames VARIABLES) - cmake_parse_arguments(DUMP "" "TOFILE" "" ${ARGN}) + cmake_parse_arguments(DUMP "FORCE" "TOFILE" "" ${ARGN}) + + # avoid generation of excessive logs with "--trace" or "--trace-expand" parameters + # Note: `-DCMAKE_TRACE_MODE=1` should be passed to CMake through command line. It is not a CMake buildin variable for now (2020-12) + # Use `cmake . -UCMAKE_TRACE_MODE` to remove this variable from cache + if(CMAKE_TRACE_MODE AND NOT DUMP_FORCE) + if(DUMP_TOFILE) + file(WRITE ${CMAKE_BINARY_DIR}/${DUMP_TOFILE} "Skipped due to enabled CMAKE_TRACE_MODE") + else() + message(AUTHOR_WARNING "ocv_cmake_dump_vars() is skipped due to enabled CMAKE_TRACE_MODE") + endif() + return() + endif() + set(regex "${DUMP_UNPARSED_ARGUMENTS}") string(TOLOWER "${regex}" regex_lower) set(__VARS "") @@ -400,6 +413,24 @@ macro(ocv_clear_vars) endforeach() endmacro() + +# Clears passed variables with INTERNAL type from CMake cache +macro(ocv_clear_internal_cache_vars) + foreach(_var ${ARGN}) + get_property(_propertySet CACHE ${_var} PROPERTY TYPE SET) + if(_propertySet) + get_property(_type CACHE ${_var} PROPERTY TYPE) + if(_type STREQUAL "INTERNAL") + message("Cleaning INTERNAL cached variable: ${_var}") + unset(${_var} CACHE) + endif() + endif() + endforeach() + unset(_propertySet) + unset(_type) +endmacro() + + set(OCV_COMPILER_FAIL_REGEX "argument .* is not valid" # GCC 9+ (including support of unicode quotes) "command[- ]line option .* is valid for .* but not for C\\+\\+" # GNU @@ -533,7 +564,11 @@ macro(ocv_check_flag_support lang flag varname base_options) elseif("_${lang}_" MATCHES "_C_") set(_lang C) elseif("_${lang}_" MATCHES "_OBJCXX_") - set(_lang OBJCXX) + if(DEFINED CMAKE_OBJCXX_COMPILER) # CMake 3.16+ and enable_language(OBJCXX) call are required + set(_lang OBJCXX) + else() + set(_lang CXX) + endif() else() set(_lang ${lang}) endif() @@ -542,7 +577,9 @@ macro(ocv_check_flag_support lang flag varname base_options) string(REGEX REPLACE "^(/|-)" "HAVE_${_lang}_" ${varname} "${${varname}}") string(REGEX REPLACE " -|-|=| |\\.|," "_" ${varname} "${${varname}}") - ocv_check_compiler_flag("${_lang}" "${base_options} ${flag}" ${${varname}} ${ARGN}) + if(DEFINED CMAKE_${_lang}_COMPILER) + ocv_check_compiler_flag("${_lang}" "${base_options} ${flag}" ${${varname}} ${ARGN}) + endif() endmacro() macro(ocv_check_runtime_flag flag result) @@ -1540,6 +1577,30 @@ function(ocv_add_library target) endfunction() +function(ocv_add_external_target name inc link def) + if(BUILD_SHARED_LIBS) + set(imp IMPORTED) + endif() + add_library(ocv.3rdparty.${name} INTERFACE ${imp}) + set_target_properties(ocv.3rdparty.${name} PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${inc}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${inc}" + INTERFACE_COMPILE_DEFINITIONS "${def}") + # When cmake version is greater than or equal to 3.11, INTERFACE_LINK_LIBRARIES no longer applies to interface library + # See https://github.com/opencv/opencv/pull/18658 + if (CMAKE_VERSION VERSION_LESS 3.11) 
+ set_target_properties(ocv.3rdparty.${name} PROPERTIES + INTERFACE_LINK_LIBRARIES "${link}") + else() + target_link_libraries(ocv.3rdparty.${name} INTERFACE ${link}) + endif() + # + if(NOT BUILD_SHARED_LIBS) + install(TARGETS ocv.3rdparty.${name} EXPORT OpenCVModules) + endif() +endfunction() + + macro(ocv_get_libname var_name) get_filename_component(__libname "${ARGN}" NAME) # libopencv_core.so.3.3 -> opencv_core diff --git a/cmake/android/android_gradle_projects.cmake b/cmake/android/android_gradle_projects.cmake index c595bee107..2e34a20d97 100644 --- a/cmake/android/android_gradle_projects.cmake +++ b/cmake/android/android_gradle_projects.cmake @@ -1,9 +1,16 @@ # https://developer.android.com/studio/releases/gradle-plugin -set(ANDROID_GRADLE_PLUGIN_VERSION "3.2.1" CACHE STRING "Android Gradle Plugin version (3.0+)") +set(ANDROID_GRADLE_PLUGIN_VERSION "3.2.1" CACHE STRING "Android Gradle Plugin version") message(STATUS "Android Gradle Plugin version: ${ANDROID_GRADLE_PLUGIN_VERSION}") +set(GRADLE_VERSION "5.6.4" CACHE STRING "Gradle version") +message(STATUS "Gradle version: ${GRADLE_VERSION}") + set(ANDROID_COMPILE_SDK_VERSION "26" CACHE STRING "Android compileSdkVersion") -set(ANDROID_MIN_SDK_VERSION "21" CACHE STRING "Android minSdkVersion") +if(ANDROID_NATIVE_API_LEVEL GREATER 21) + set(ANDROID_MIN_SDK_VERSION "${ANDROID_NATIVE_API_LEVEL}" CACHE STRING "Android minSdkVersion") +else() + set(ANDROID_MIN_SDK_VERSION "21" CACHE STRING "Android minSdkVersion") +endif() set(ANDROID_TARGET_SDK_VERSION "26" CACHE STRING "Android minSdkVersion") set(ANDROID_BUILD_BASE_DIR "${OpenCV_BINARY_DIR}/opencv_android" CACHE INTERNAL "") @@ -38,9 +45,11 @@ set(ANDROID_ABI_FILTER "${ANDROID_INSTALL_ABI_FILTER}") configure_file("${OpenCV_SOURCE_DIR}/samples/android/build.gradle.in" "${ANDROID_TMP_INSTALL_BASE_DIR}/${ANDROID_INSTALL_SAMPLES_DIR}/build.gradle" @ONLY) install(FILES "${ANDROID_TMP_INSTALL_BASE_DIR}/${ANDROID_INSTALL_SAMPLES_DIR}/build.gradle" DESTINATION "${ANDROID_INSTALL_SAMPLES_DIR}" COMPONENT samples) +configure_file("${OpenCV_SOURCE_DIR}/platforms/android/gradle-wrapper/gradle/wrapper/gradle-wrapper.properties.in" "${ANDROID_BUILD_BASE_DIR}/gradle/wrapper/gradle-wrapper.properties" @ONLY) +install(FILES "${ANDROID_BUILD_BASE_DIR}/gradle/wrapper/gradle-wrapper.properties" DESTINATION "${ANDROID_INSTALL_SAMPLES_DIR}/gradle/wrapper" COMPONENT samples) + set(GRADLE_WRAPPER_FILES "gradle/wrapper/gradle-wrapper.jar" - "gradle/wrapper/gradle-wrapper.properties" "gradlew.bat" "gradlew" "gradle.properties" diff --git a/cmake/platforms/OpenCV-Emscripten.cmake b/cmake/platforms/OpenCV-Emscripten.cmake new file mode 100644 index 0000000000..ec15fba799 --- /dev/null +++ b/cmake/platforms/OpenCV-Emscripten.cmake @@ -0,0 +1 @@ +set(OPENCV_SKIP_LINK_AS_NEEDED 1) diff --git a/cmake/templates/opencv_abi.xml.in b/cmake/templates/opencv_abi.xml.in index 711c4e99ee..c3a39d6dfe 100644 --- a/cmake/templates/opencv_abi.xml.in +++ b/cmake/templates/opencv_abi.xml.in @@ -26,7 +26,9 @@ opencv2/core/hal/*.impl.* opencv2/core/cuda* opencv2/core/opencl* + opencv2/core/parallel/backend/* opencv2/core/private* + opencv2/core/*quaternion* opencv/cxeigen.hpp opencv2/core/eigen.hpp opencv2/flann/hdf5.h diff --git a/cmake/templates/xcode-launch-c.in b/cmake/templates/xcode-launch-c.in new file mode 100644 index 0000000000..609dbf47b1 --- /dev/null +++ b/cmake/templates/xcode-launch-c.in @@ -0,0 +1,11 @@ +#!/bin/sh +# https://crascit.com/2016/04/09/using-ccache-with-cmake/ + +# Xcode generator doesn't include the 
compiler as the +# first argument, Ninja and Makefiles do. Handle both cases. +if [[ "$1" = "${CMAKE_C_COMPILER}" ]] ; then + shift +fi + +export CCACHE_CPP2=true +exec "${CCACHE_PROGRAM}" "${CMAKE_C_COMPILER}" "$@" diff --git a/cmake/templates/xcode-launch-cxx.in b/cmake/templates/xcode-launch-cxx.in new file mode 100644 index 0000000000..09233b3859 --- /dev/null +++ b/cmake/templates/xcode-launch-cxx.in @@ -0,0 +1,11 @@ +#!/bin/sh +# https://crascit.com/2016/04/09/using-ccache-with-cmake/ + +# Xcode generator doesn't include the compiler as the +# first argument, Ninja and Makefiles do. Handle both cases. +if [[ "$1" = "${CMAKE_CXX_COMPILER}" ]] ; then + shift +fi + +export CCACHE_CPP2=true +exec "${CCACHE_PROGRAM}" "${CMAKE_CXX_COMPILER}" "$@" diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index ec7d7cd3b3..a321be9878 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -39,7 +39,6 @@ ALIASES += end_toggle="@htmlonly[block] @endhtmlonly" ALIASES += prev_tutorial{1}="**Prev Tutorial:** \ref \1 \n" ALIASES += next_tutorial{1}="**Next Tutorial:** \ref \1 \n" ALIASES += youtube{1}="@htmlonly[block]
@endhtmlonly" -TCL_SUBST = OPTIMIZE_OUTPUT_FOR_C = NO OPTIMIZE_OUTPUT_JAVA = NO OPTIMIZE_FOR_FORTRAN = NO @@ -228,6 +227,7 @@ INCLUDE_PATH = INCLUDE_FILE_PATTERNS = PREDEFINED = __cplusplus=1 \ CVAPI(x)=x \ + CV_API_CALL= \ CV_DOXYGEN= \ CV_EXPORTS= \ CV_EXPORTS_W= \ @@ -255,6 +255,12 @@ PREDEFINED = __cplusplus=1 \ CV_DEFAULT(x)=" = x" \ CV_NEON=1 \ CV_SSE2=1 \ + CV_SIMD128=1 \ + CV_SIMD256=1 \ + CV_SIMD512=1 \ + CV_SIMD128_64F=1 \ + CV_SIMD256_64F=1 \ + CV_SIMD512_64F=1 \ CV__DEBUG_NS_BEGIN= \ CV__DEBUG_NS_END= \ CV_DEPRECATED_EXTERNAL= \ diff --git a/doc/js_tutorials/js_assets/js_dnn_example_helper.js b/doc/js_tutorials/js_assets/js_dnn_example_helper.js new file mode 100644 index 0000000000..06baa6760b --- /dev/null +++ b/doc/js_tutorials/js_assets/js_dnn_example_helper.js @@ -0,0 +1,119 @@ +getBlobFromImage = function(inputSize, mean, std, swapRB, image) { + let mat; + if (typeof(image) === 'string') { + mat = cv.imread(image); + } else { + mat = image; + } + + let matC3 = new cv.Mat(mat.matSize[0], mat.matSize[1], cv.CV_8UC3); + cv.cvtColor(mat, matC3, cv.COLOR_RGBA2BGR); + let input = cv.blobFromImage(matC3, std, new cv.Size(inputSize[0], inputSize[1]), + new cv.Scalar(mean[0], mean[1], mean[2]), swapRB); + + matC3.delete(); + return input; +} + +loadLables = async function(labelsUrl) { + let response = await fetch(labelsUrl); + let label = await response.text(); + label = label.split('\n'); + return label; +} + +loadModel = async function(e) { + return new Promise((resolve) => { + let file = e.target.files[0]; + let path = file.name; + let reader = new FileReader(); + reader.readAsArrayBuffer(file); + reader.onload = function(ev) { + if (reader.readyState === 2) { + let buffer = reader.result; + let data = new Uint8Array(buffer); + cv.FS_createDataFile('/', path, data, true, false, false); + resolve(path); + } + } + }); +} + +getTopClasses = function(probs, labels, topK = 3) { + probs = Array.from(probs); + let indexes = probs.map((prob, index) => [prob, index]); + let sorted = indexes.sort((a, b) => { + if (a[0] === b[0]) {return 0;} + return a[0] < b[0] ? 
-1 : 1; + }); + sorted.reverse(); + let classes = []; + for (let i = 0; i < topK; ++i) { + let prob = sorted[i][0]; + let index = sorted[i][1]; + let c = { + label: labels[index], + prob: (prob * 100).toFixed(2) + } + classes.push(c); + } + return classes; +} + +loadImageToCanvas = function(e, canvasId) { + let files = e.target.files; + let imgUrl = URL.createObjectURL(files[0]); + let canvas = document.getElementById(canvasId); + let ctx = canvas.getContext('2d'); + let img = new Image(); + img.crossOrigin = 'anonymous'; + img.src = imgUrl; + img.onload = function() { + ctx.drawImage(img, 0, 0, canvas.width, canvas.height); + }; +} + +drawInfoTable = async function(jsonUrl, divId) { + let response = await fetch(jsonUrl); + let json = await response.json(); + + let appendix = document.getElementById(divId); + for (key of Object.keys(json)) { + let h3 = document.createElement('h3'); + h3.textContent = key + " model"; + appendix.appendChild(h3); + + let table = document.createElement('table'); + let head_tr = document.createElement('tr'); + for (head of Object.keys(json[key][0])) { + let th = document.createElement('th'); + th.textContent = head; + th.style.border = "1px solid black"; + head_tr.appendChild(th); + } + table.appendChild(head_tr) + + for (model of json[key]) { + let tr = document.createElement('tr'); + for (params of Object.keys(model)) { + let td = document.createElement('td'); + td.style.border = "1px solid black"; + if (params !== "modelUrl" && params !== "configUrl" && params !== "labelsUrl") { + td.textContent = model[params]; + tr.appendChild(td); + } else { + let a = document.createElement('a'); + let link = document.createTextNode('link'); + a.append(link); + a.href = model[params]; + td.appendChild(a); + tr.appendChild(td); + } + } + table.appendChild(tr); + } + table.style.width = "800px"; + table.style.borderCollapse = "collapse"; + appendix.appendChild(table); + } +} diff --git a/doc/js_tutorials/js_assets/js_image_classification.html b/doc/js_tutorials/js_assets/js_image_classification.html new file mode 100644 index 0000000000..656f2720b6 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_image_classification.html @@ -0,0 +1,263 @@ + + + + + + Image Classification Example + + + + +

[js_image_classification.html: new tutorial page; markup and inline code snippets not recoverable. Page text:]

Image Classification Example

This tutorial shows you how to write an image classification example with OpenCV.js.
To try the example, click the modelFile button (and the configFile button if needed) to upload the inference model.
You can find the model URLs and parameters in the model info section.
Then change the parameters in the first code snippet according to the uploaded model.
Finally, click the Try it button to see the result. You can choose any other image.

Inputs: canvasInput, modelFile, configFile

Help function
1. The parameters for model inference, which you can modify to investigate more models.
2. The main loop, which reads the image from the canvas and runs inference once (see the sketch below).
3. Load labels from a txt file and process them into an array.
4. Get a blob from the image as input for the net, and standardize it with the mean and std.
5. Fetch the model file and save it to the Emscripten file system once the input button is clicked.
6. The post-processing, including softmax if needed, and getting the top classes from the output vector.

Model Info: see js_image_classification_model_info.json below.
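The page's inline snippets did not survive extraction; functionally they perform a single dnn pass over the canvas image using the helpers from js_dnn_example_helper.js above (loadLables, getBlobFromImage, getTopClasses). A minimal sketch, not the page's exact code: the classifyCanvas wrapper and the 224x224 input size are assumptions, while the mean/std/swapRB values and the needSoftmax flag come from the model info JSON below.

@code{.js}
// Minimal sketch of the classification main loop (assumed wrapper, not page code).
async function classifyCanvas(modelPath, configPath, labelsUrl) {
    const inputSize = [224, 224];      // assumed; not listed in the model info JSON
    const mean = [104, 117, 123];      // GoogLeNet row of the model info JSON
    const std = 1;
    const swapRB = false;

    const labels = await loadLables(labelsUrl);   // helper above: one label per line
    const input = getBlobFromImage(inputSize, mean, std, swapRB, 'canvasInput');

    let net = cv.readNet(configPath, modelPath);  // framework guessed from extensions
    net.setInput(input);
    const result = net.forward();                 // 1 x N class scores

    // if the model info row says needSoftmax=true, apply softmax to the scores first
    const classes = getTopClasses(result.data32F, labels);  // top-3 {label, prob}
    console.log(classes);

    input.delete(); result.delete(); net.delete();
}
@endcode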
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_image_classification_model_info.json b/doc/js_tutorials/js_assets/js_image_classification_model_info.json new file mode 100644 index 0000000000..67553ec2d3 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_image_classification_model_info.json @@ -0,0 +1,65 @@ +{ + "caffe": [ + { + "model": "alexnet", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "needSoftmax": "false", + "labelsUrl": "https://raw.githubusercontent.com/opencv/opencv/master/samples/data/dnn/classification_classes_ILSVRC2012.txt", + "modelUrl": "http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel", + "configUrl": "https://raw.githubusercontent.com/BVLC/caffe/master/models/bvlc_alexnet/deploy.prototxt" + }, + { + "model": "densenet", + "mean": "127.5, 127.5, 127.5", + "std": "0.007843", + "swapRB": "false", + "needSoftmax": "true", + "labelsUrl": "https://raw.githubusercontent.com/opencv/opencv/master/samples/data/dnn/classification_classes_ILSVRC2012.txt", + "modelUrl": "https://drive.google.com/open?id=0B7ubpZO7HnlCcHlfNmJkU2VPelE", + "configUrl": "https://raw.githubusercontent.com/shicai/DenseNet-Caffe/master/DenseNet_121.prototxt" + }, + { + "model": "googlenet", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "needSoftmax": "false", + "labelsUrl": "https://raw.githubusercontent.com/opencv/opencv/master/samples/data/dnn/classification_classes_ILSVRC2012.txt", + "modelUrl": "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel", + "configUrl": "https://raw.githubusercontent.com/BVLC/caffe/master/models/bvlc_googlenet/deploy.prototxt" + }, + { + "model": "squeezenet", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "needSoftmax": "false", + "labelsUrl": "https://raw.githubusercontent.com/opencv/opencv/master/samples/data/dnn/classification_classes_ILSVRC2012.txt", + "modelUrl": "https://raw.githubusercontent.com/forresti/SqueezeNet/master/SqueezeNet_v1.0/squeezenet_v1.0.caffemodel", + "configUrl": "https://raw.githubusercontent.com/forresti/SqueezeNet/master/SqueezeNet_v1.0/deploy.prototxt" + }, + { + "model": "VGG", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "needSoftmax": "false", + "labelsUrl": "https://raw.githubusercontent.com/opencv/opencv/master/samples/data/dnn/classification_classes_ILSVRC2012.txt", + "modelUrl": "http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_19_layers.caffemodel", + "configUrl": "https://gist.githubusercontent.com/ksimonyan/3785162f95cd2d5fee77/raw/f02f8769e64494bcd3d7e97d5d747ac275825721/VGG_ILSVRC_19_layers_deploy.prototxt" + } + ], + "tensorflow": [ + { + "model": "inception", + "mean": "123, 117, 104", + "std": "1", + "swapRB": "true", + "needSoftmax": "false", + "labelsUrl": "https://raw.githubusercontent.com/petewarden/tf_ios_makefile_example/master/data/imagenet_comp_graph_label_strings.txt", + "modelUrl": "https://raw.githubusercontent.com/petewarden/tf_ios_makefile_example/master/data/tensorflow_inception_graph.pb" + } + ] +} \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_image_classification_with_camera.html b/doc/js_tutorials/js_assets/js_image_classification_with_camera.html new file mode 100644 index 0000000000..9a2473cf2b --- /dev/null +++ b/doc/js_tutorials/js_assets/js_image_classification_with_camera.html @@ -0,0 +1,281 @@ + + + + + + Image Classification Example with Camera + + + + +

[js_image_classification_with_camera.html: new tutorial page; markup and inline code snippets not recoverable. Page text:]

Image Classification Example with Camera

This tutorial shows you how to write an image classification example with camera.
To try the example, click the modelFile button (and the configFile button if needed) to upload the inference model.
You can find the model URLs and parameters in the model info section.
Then change the parameters in the first code snippet according to the uploaded model.
Finally, click the Start/Stop button to start or stop the camera capture.

Inputs: videoInput, modelFile, configFile

Help function
1. The parameters for model inference, which you can modify to investigate more models.
2. The function that captures video from the camera, and the main loop that runs inference once per frame (see the sketch below).
3. Load labels from a txt file and process them into an array.
4. Get a blob from the image as input for the net, and standardize it with the mean and std.
5. Fetch the model file and save it to the Emscripten file system once the input button is clicked.
6. The post-processing, including softmax if needed, and getting the top classes from the output vector.

Model Info: see js_image_classification_model_info.json above.
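The camera variant differs from the still-image page mainly in step 2: inference runs inside a scheduled capture loop. A minimal sketch, assuming net, labels and the blob parameters are already prepared as in the still-image example and that a playing <video> element has the id videoInput:

@code{.js}
// Minimal sketch of the capture loop (not the page's exact code).
const video = document.getElementById('videoInput');   // playing webcam stream
let cap = new cv.VideoCapture(video);
let frame = new cv.Mat(video.height, video.width, cv.CV_8UC4);

const FPS = 30;
function processVideo() {
    const begin = Date.now();
    cap.read(frame);                                   // grab one RGBA frame
    const input = getBlobFromImage(inputSize, mean, std, swapRB, frame);
    net.setInput(input);
    const result = net.forward();
    const classes = getTopClasses(result.data32F, labels);
    input.delete(); result.delete();
    // keep roughly constant FPS by subtracting the processing time
    const delay = 1000 / FPS - (Date.now() - begin);
    setTimeout(processVideo, Math.max(0, delay));
}
setTimeout(processVideo, 0);
@endcode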
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_intelligent_scissors.html b/doc/js_tutorials/js_assets/js_intelligent_scissors.html new file mode 100644 index 0000000000..1782dc6f03 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_intelligent_scissors.html @@ -0,0 +1,127 @@ + + + + +Intelligent Scissors Example + + + +

[js_intelligent_scissors.html: new demo page; markup and inline code snippets not recoverable. Page text:]

Intelligent Scissors Example

Click the Start button to launch the code below.
Then click on the image to pick a source point. After that you can hover the mouse pointer over the canvas to specify a target point candidate.
You can change the code in the <textarea> to investigate more. You can choose another image (you need to Stop first).

Inputs: canvasInput

(A sketch of the tool's use follows below.)
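The page's stripped snippets wire the mouse events to the scissors tool. A minimal sketch of that wiring; the underscore-joined OpenCV.js class name and the sample click/hover coordinates are assumptions of this sketch, while the method names follow the C++ @ref cv::segmentation::IntelligentScissorsMB API:

@code{.js}
// Minimal IntelligentScissors sketch (assumed binding name, not the page's code).
let src = cv.imread('canvasInput');
let tool = new cv.segmentation_IntelligentScissorsMB();
tool.setEdgeFeatureCannyParameters(32, 100);   // Canny hysteresis thresholds
tool.setGradientMagnitudeMaxLimit(200);
tool.applyImage(src);                          // pre-compute features once per image

// on mouse click: pick the source point and build the optimal-path map
let sourcePoint = new cv.Point(100, 100);      // hypothetical click position
tool.buildMap(sourcePoint);

// on mouse move: extract the contour to the current target point candidate
let targetPoint = new cv.Point(200, 150);      // hypothetical hover position
let contour = new cv.Mat();
tool.getContour(targetPoint, contour);

let contours = new cv.MatVector();
contours.push_back(contour);
cv.polylines(src, contours, false, new cv.Scalar(0, 255, 0, 255));
cv.imshow('canvasInput', src);                 // redraw with the current path
src.delete(); contour.delete(); contours.delete();
@endcode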
+ + + + + + + + + diff --git a/doc/js_tutorials/js_assets/js_object_detection.html b/doc/js_tutorials/js_assets/js_object_detection.html new file mode 100644 index 0000000000..53f1e48639 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_object_detection.html @@ -0,0 +1,387 @@ + + + + + + Object Detection Example + + + + +

[js_object_detection.html: new tutorial page; markup and inline code snippets not recoverable. Page text:]

Object Detection Example

This tutorial shows you how to write an object detection example with OpenCV.js.
To try the example, click the modelFile button (and the configFile button if needed) to upload the inference model.
You can find the model URLs and parameters in the model info section.
Then change the parameters in the first code snippet according to the uploaded model.
Finally, click the Try it button to see the result. You can choose any other image.

Inputs: canvasInput, modelFile, configFile

Help function
1. The parameters for model inference, which you can modify to investigate more models.
2. The main loop, which reads the image from the canvas and runs inference once.
3. Load labels from a txt file and process them into an array.
4. Get a blob from the image as input for the net, and standardize it with the mean and std.
5. Fetch the model file and save it to the Emscripten file system once the input button is clicked.
6. The post-processing, including getting the boxes from the output and drawing them into the image (see the sketch below).

Model Info: see js_object_detection_model_info.json below.
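For the "SSD" output type named in the model info JSON below, the stripped post-processing snippet walks a 1x1xNx7 detection blob. A minimal decoding sketch (a hypothetical helper, not the page's exact code):

@code{.js}
// Minimal SSD-style decoding sketch. Each detection row is a 7-tuple:
// [batchId, classId, confidence, left, top, right, bottom], coordinates in [0, 1].
// width/height/threshold are hypothetical caller-supplied values.
function decodeSSD(result, width, height, threshold = 0.5) {
    const data = result.data32F;
    const boxes = [];
    for (let i = 0; i < data.length; i += 7) {
        const confidence = data[i + 2];
        if (confidence < threshold) continue;   // drop low-confidence detections
        boxes.push({
            classId: data[i + 1],
            confidence: confidence,
            left: data[i + 3] * width,
            top: data[i + 4] * height,
            right: data[i + 5] * width,
            bottom: data[i + 6] * height,
        });
    }
    return boxes;  // draw each with cv.rectangle and label with labels[classId]
}
@endcode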
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_object_detection_model_info.json b/doc/js_tutorials/js_assets/js_object_detection_model_info.json new file mode 100644 index 0000000000..c0d14be714 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_object_detection_model_info.json @@ -0,0 +1,39 @@ +{ + "caffe": [ + { + "model": "mobilenet_SSD", + "inputSize": "300, 300", + "mean": "127.5, 127.5, 127.5", + "std": "0.007843", + "swapRB": "false", + "outType": "SSD", + "labelsUrl": "https://raw.githubusercontent.com/opencv/opencv/master/samples/data/dnn/object_detection_classes_pascal_voc.txt", + "modelUrl": "https://raw.githubusercontent.com/chuanqi305/MobileNet-SSD/master/mobilenet_iter_73000.caffemodel", + "configUrl": "https://raw.githubusercontent.com/chuanqi305/MobileNet-SSD/master/deploy.prototxt" + }, + { + "model": "VGG_SSD", + "inputSize": "300, 300", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "outType": "SSD", + "labelsUrl": "https://raw.githubusercontent.com/opencv/opencv/master/samples/data/dnn/object_detection_classes_pascal_voc.txt", + "modelUrl": "https://drive.google.com/uc?id=0BzKzrI_SkD1_WVVTSmQxU0dVRzA&export=download", + "configUrl": "https://drive.google.com/uc?id=0BzKzrI_SkD1_WVVTSmQxU0dVRzA&export=download" + } + ], + "darknet": [ + { + "model": "yolov2_tiny", + "inputSize": "416, 416", + "mean": "0, 0, 0", + "std": "0.00392", + "swapRB": "false", + "outType": "YOLO", + "labelsUrl": "https://raw.githubusercontent.com/opencv/opencv/master/samples/data/dnn/object_detection_classes_yolov3.txt", + "modelUrl": "https://pjreddie.com/media/files/yolov2-tiny.weights", + "configUrl": "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov2-tiny.cfg" + } + ] +} \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_object_detection_with_camera.html b/doc/js_tutorials/js_assets/js_object_detection_with_camera.html new file mode 100644 index 0000000000..41bb609708 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_object_detection_with_camera.html @@ -0,0 +1,402 @@ + + + + + + Object Detection Example with Camera + + + + +

[js_object_detection_with_camera.html: new tutorial page; markup and inline code snippets not recoverable. Page text:]

Object Detection Example with Camera

This tutorial shows you how to write an object detection example with camera.
To try the example, click the modelFile button (and the configFile button if needed) to upload the inference model.
You can find the model URLs and parameters in the model info section.
Then change the parameters in the first code snippet according to the uploaded model.
Finally, click the Start/Stop button to start or stop the camera capture.

Inputs: videoInput, modelFile, configFile

Help function
1. The parameters for model inference, which you can modify to investigate more models.
2. The function that captures video from the camera, and the main loop that runs inference once per frame (see the capture-loop sketch after the classification-with-camera page above).
3. Load labels from a txt file and process them into an array.
4. Get a blob from the image as input for the net, and standardize it with the mean and std.
5. Fetch the model file and save it to the Emscripten file system once the input button is clicked.
6. The post-processing, including getting the boxes from the output and drawing them into the image.

Model Info: see js_object_detection_model_info.json above.
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_pose_estimation.html b/doc/js_tutorials/js_assets/js_pose_estimation.html new file mode 100644 index 0000000000..19c64663d1 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_pose_estimation.html @@ -0,0 +1,327 @@ + + + + + + Pose Estimation Example + + + + +

[js_pose_estimation.html: new tutorial page; markup and inline code snippets not recoverable. Page text:]

Pose Estimation Example

This tutorial shows you how to write a pose estimation example with OpenCV.js.
To try the example, click the modelFile button (and the configFile button if needed) to upload the inference model.
You can find the model URLs and parameters in the model info section.
Then change the parameters in the first code snippet according to the uploaded model.
Finally, click the Try it button to see the result. You can choose any other image.

Inputs: canvasInput, modelFile, configFile

Help function
1. The parameters for model inference, which you can modify to investigate more models.
2. The main loop, which reads the image from the canvas and runs inference once.
3. Get a blob from the image as input for the net, and standardize it with the mean and std.
4. Fetch the model file and save it to the Emscripten file system once the input button is clicked.
5. The pairs of keypoints for the different datasets.
6. The post-processing, including getting the predicted points and drawing lines into the image (see the sketch below).

Model Info: see js_pose_estimation_model_info.json below.
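The stripped post-processing snippet (step 6) reduces each keypoint heatmap to its argmax before drawing the dataset-specific limb pairs. A minimal sketch (a hypothetical helper; the 0.1 score threshold is an assumption):

@code{.js}
// Minimal keypoint extraction sketch (not the page's exact code). OpenPose-style
// nets output 1 x K x H x W heatmaps; width/height are the display canvas size.
function extractKeypoints(result, width, height, scoreThreshold = 0.1) {
    const K = result.matSize[1], H = result.matSize[2], W = result.matSize[3];
    const data = result.data32F;
    const points = [];
    for (let k = 0; k < K; ++k) {
        let best = -1, bestIdx = 0;
        for (let i = 0; i < H * W; ++i) {      // argmax over one heatmap
            const v = data[k * H * W + i];
            if (v > best) { best = v; bestIdx = i; }
        }
        points.push(best > scoreThreshold
            ? { x: (bestIdx % W) * width / W, y: Math.floor(bestIdx / W) * height / H }
            : null);                           // keypoint not found in this image
    }
    return points;  // connect with the dataset-specific pairs (step 5) via cv.line
}
@endcode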
+ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_pose_estimation_model_info.json b/doc/js_tutorials/js_assets/js_pose_estimation_model_info.json new file mode 100644 index 0000000000..922c813f39 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_pose_estimation_model_info.json @@ -0,0 +1,34 @@ +{ + "caffe": [ + { + "model": "body_25", + "inputSize": "368, 368", + "mean": "0, 0, 0", + "std": "0.00392", + "swapRB": "false", + "dataset": "BODY_25", + "modelUrl": "http://posefs1.perception.cs.cmu.edu/OpenPose/models/pose/body_25/pose_iter_584000.caffemodel", + "configUrl": "https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/openpose/master/models/pose/body_25/pose_deploy.prototxt" + }, + { + "model": "coco", + "inputSize": "368, 368", + "mean": "0, 0, 0", + "std": "0.00392", + "swapRB": "false", + "dataset": "COCO", + "modelUrl": "http://posefs1.perception.cs.cmu.edu/OpenPose/models/pose/coco/pose_iter_440000.caffemodel", + "configUrl": "https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/openpose/master/models/pose/coco/pose_deploy_linevec.prototxt" + }, + { + "model": "mpi", + "inputSize": "368, 368", + "mean": "0, 0, 0", + "std": "0.00392", + "swapRB": "false", + "dataset": "MPI", + "modelUrl": "http://posefs1.perception.cs.cmu.edu/OpenPose/models/pose/mpi/pose_iter_160000.caffemodel", + "configUrl": "https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/openpose/master/models/pose/mpi/pose_deploy_linevec.prototxt" + } + ] +} \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_semantic_segmentation.html b/doc/js_tutorials/js_assets/js_semantic_segmentation.html new file mode 100644 index 0000000000..6fc27dbd19 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_semantic_segmentation.html @@ -0,0 +1,243 @@ + + + + + + Semantic Segmentation Example + + + + +

[js_semantic_segmentation.html: new tutorial page; markup and inline code snippets not recoverable. Page text:]

Semantic Segmentation Example

This tutorial shows you how to write a semantic segmentation example with OpenCV.js.
To try the example, click the modelFile button (and the configFile button if needed) to upload the inference model.
You can find the model URLs and parameters in the model info section.
Then change the parameters in the first code snippet according to the uploaded model.
Finally, click the Try it button to see the result. You can choose any other image.

Inputs: canvasInput, modelFile, configFile

Help function
1. The parameters for model inference, which you can modify to investigate more models.
2. The main loop, which reads the image from the canvas and runs inference once.
3. Get a blob from the image as input for the net, and standardize it with the mean and std.
4. Fetch the model file and save it to the Emscripten file system once the input button is clicked.
5. The post-processing, including generating colors for the different classes and an argmax to get the class for each pixel (see the sketch below).

Model Info: see js_semantic_segmentation_model_info.json below.
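The post-processing in step 5 is a per-pixel argmax over the class planes mapped through a generated palette. A minimal sketch (a hypothetical helper; colors is an assumed array of one [r, g, b] triple per class):

@code{.js}
// Minimal per-pixel argmax colouring sketch (not the page's exact code).
// result is a 1 x C x H x W blob of class scores.
function colorizeSegmentation(result, colors) {
    const C = result.matSize[1], H = result.matSize[2], W = result.matSize[3];
    const data = result.data32F;
    let output = new cv.Mat(H, W, cv.CV_8UC3);
    for (let y = 0; y < H; ++y) {
        for (let x = 0; x < W; ++x) {
            let cls = 0, best = data[y * W + x];        // score of class 0
            for (let c = 1; c < C; ++c) {               // argmax over channels
                const v = data[(c * H + y) * W + x];
                if (v > best) { best = v; cls = c; }
            }
            output.ucharPtr(y, x).set(colors[cls]);     // write the class colour
        }
    }
    return output;  // show with cv.imshow on an output canvas
}
@endcode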
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_semantic_segmentation_model_info.json b/doc/js_tutorials/js_assets/js_semantic_segmentation_model_info.json new file mode 100644 index 0000000000..ef0016af1d --- /dev/null +++ b/doc/js_tutorials/js_assets/js_semantic_segmentation_model_info.json @@ -0,0 +1,12 @@ +{ + "tensorflow": [ + { + "model": "deeplabv3", + "inputSize": "513, 513", + "mean": "127.5, 127.5, 127.5", + "std": "0.007843", + "swapRB": "false", + "modelUrl": "https://drive.google.com/uc?id=1v-hfGenaE9tiGOzo5qdgMNG_gqQ5-Xn4&export=download" + } + ] +} \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_style_transfer.html b/doc/js_tutorials/js_assets/js_style_transfer.html new file mode 100644 index 0000000000..91422e1344 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_style_transfer.html @@ -0,0 +1,228 @@ + + + + + + Style Transfer Example + + + + +

[js_style_transfer.html: new tutorial page; markup and inline code snippets not recoverable. Page text:]

Style Transfer Example

This tutorial shows you how to write a style transfer example with OpenCV.js.
To try the example, click the modelFile button (and the configFile button if needed) to upload the inference model.
You can find the model URLs and parameters in the model info section.
Then change the parameters in the first code snippet according to the uploaded model.
Finally, click the Try it button to see the result. You can choose any other image.

Inputs: canvasInput, modelFile, configFile

Help function
1. The parameters for model inference, which you can modify to investigate more models.
2. The main loop, which reads the image from the canvas and runs inference once.
3. Get a blob from the image as input for the net, and standardize it with the mean and std.
4. Fetch the model file and save it to the Emscripten file system once the input button is clicked.
5. The post-processing, including scaling and reordering (see the sketch below).

Model Info: see js_style_transfer_model_info.json below.
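The post-processing in step 5 inverts the preprocessing: the training mean is added back, values are clamped to [0, 255], and the planar CHW blob is reordered to an HWC image. A minimal sketch (a hypothetical helper, not the page's exact code):

@code{.js}
// Minimal style-transfer post-processing sketch. result is a 1 x 3 x H x W blob;
// mean is the same triple used to build the input blob (model info JSON below).
function blobToImage(result, mean) {
    const H = result.matSize[2], W = result.matSize[3];
    const data = result.data32F;
    let output = new cv.Mat(H, W, cv.CV_8UC3);
    for (let y = 0; y < H; ++y) {
        for (let x = 0; x < W; ++x) {
            const pixel = [0, 1, 2].map(c =>       // undo mean subtraction, clamp
                Math.min(255, Math.max(0, data[(c * H + y) * W + x] + mean[c])));
            output.ucharPtr(y, x).set(pixel);
        }
    }
    return output;  // e.g. cv.imshow('canvasOutput', output); the id is an assumption
}
@endcode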
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_style_transfer_model_info.json b/doc/js_tutorials/js_assets/js_style_transfer_model_info.json new file mode 100644 index 0000000000..9cc66018a0 --- /dev/null +++ b/doc/js_tutorials/js_assets/js_style_transfer_model_info.json @@ -0,0 +1,76 @@ +{ + "torch": [ + { + "model": "candy.t7", + "inputSize": "224, 224", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "modelUrl": "https://cs.stanford.edu/people/jcjohns/fast-neural-style/models//instance_norm/candy.t7" + }, + { + "model": "composition_vii.t7", + "inputSize": "224, 224", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "modelUrl": "https://cs.stanford.edu/people/jcjohns/fast-neural-style/models//eccv16/composition_vii.t7" + }, + { + "model": "feathers.t7", + "inputSize": "224, 224", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "modelUrl": "https://cs.stanford.edu/people/jcjohns/fast-neural-style/models//instance_norm/feathers.t7" + }, + { + "model": "la_muse.t7", + "inputSize": "224, 224", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "modelUrl": "https://cs.stanford.edu/people/jcjohns/fast-neural-style/models//instance_norm/la_muse.t7" + }, + { + "model": "mosaic.t7", + "inputSize": "224, 224", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "modelUrl": "https://cs.stanford.edu/people/jcjohns/fast-neural-style/models//instance_norm/mosaic.t7" + }, + { + "model": "starry_night.t7", + "inputSize": "224, 224", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "modelUrl": "https://cs.stanford.edu/people/jcjohns/fast-neural-style/models//eccv16/starry_night.t7" + }, + { + "model": "the_scream.t7", + "inputSize": "224, 224", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "modelUrl": "https://cs.stanford.edu/people/jcjohns/fast-neural-style/models//instance_norm/the_scream.t7" + }, + { + "model": "the_wave.t7", + "inputSize": "224, 224", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "modelUrl": "https://cs.stanford.edu/people/jcjohns/fast-neural-style/models//eccv16/the_wave.t7" + }, + { + "model": "udnie.t7", + "inputSize": "224, 224", + "mean": "104, 117, 123", + "std": "1", + "swapRB": "false", + "modelUrl": "https://cs.stanford.edu/people/jcjohns/fast-neural-style/models//instance_norm/udnie.t7" + } + ] +} \ No newline at end of file diff --git a/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html b/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html index ad2bb54c48..b9f6871ec0 100644 --- a/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html +++ b/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html @@ -74,7 +74,8 @@ let utils = new Utils('errorMessage'); utils.loadCode('codeSnippet', 'codeEditor'); utils.loadImageToCanvas('lena.jpg', 'imageCanvasInput'); utils.loadImageToCanvas('lenaFace.png', 'templateCanvasInput'); -utils.addFileInputHandler('fileInput', 'canvasInput'); +utils.addFileInputHandler('fileInput', 'imageCanvasInput'); +utils.addFileInputHandler('templateFileInput', 'templateCanvasInput'); let tryIt = document.getElementById('tryIt'); tryIt.addEventListener('click', () => { diff --git a/doc/js_tutorials/js_assets/utils.js b/doc/js_tutorials/js_assets/utils.js index 4d5deb0b51..65f6d1782d 100644 --- a/doc/js_tutorials/js_assets/utils.js +++ b/doc/js_tutorials/js_assets/utils.js @@ -7,7 +7,7 @@ function Utils(errorOutputId) { // 
eslint-disable-line no-unused-vars let script = document.createElement('script'); script.setAttribute('async', ''); script.setAttribute('type', 'text/javascript'); - script.addEventListener('load', () => { + script.addEventListener('load', async () => { if (cv.getBuildInformation) { console.log(cv.getBuildInformation()); @@ -16,9 +16,15 @@ function Utils(errorOutputId) { // eslint-disable-line no-unused-vars else { // WASM - cv['onRuntimeInitialized']=()=>{ + if (cv instanceof Promise) { + cv = await cv; console.log(cv.getBuildInformation()); onloadCallback(); + } else { + cv['onRuntimeInitialized']=()=>{ + console.log(cv.getBuildInformation()); + onloadCallback(); + } } } }); diff --git a/doc/js_tutorials/js_dnn/js_image_classification/js_image_classification.markdown b/doc/js_tutorials/js_dnn/js_image_classification/js_image_classification.markdown new file mode 100644 index 0000000000..1a94f8d14a --- /dev/null +++ b/doc/js_tutorials/js_dnn/js_image_classification/js_image_classification.markdown @@ -0,0 +1,13 @@ +Image Classification Example {#tutorial_js_image_classification} +======================================= + +Goal +---- + +- In this tutorial you will learn how to use OpenCV.js dnn module for image classification. + +\htmlonly + +\endhtmlonly \ No newline at end of file diff --git a/doc/js_tutorials/js_dnn/js_image_classification/js_image_classification_with_camera.markdown b/doc/js_tutorials/js_dnn/js_image_classification/js_image_classification_with_camera.markdown new file mode 100644 index 0000000000..bdf11161fc --- /dev/null +++ b/doc/js_tutorials/js_dnn/js_image_classification/js_image_classification_with_camera.markdown @@ -0,0 +1,15 @@ +Image Classification Example with Camera {#tutorial_js_image_classification_with_camera} +======================================= + +Goal +---- + +- In this tutorial you will learn how to use OpenCV.js dnn module for image classification example with camera. + +@note If you don't know how to capture video from camera, please review @ref tutorial_js_video_display. + +\htmlonly + +\endhtmlonly \ No newline at end of file diff --git a/doc/js_tutorials/js_dnn/js_object_detection/js_object_detection.markdown b/doc/js_tutorials/js_dnn/js_object_detection/js_object_detection.markdown new file mode 100644 index 0000000000..980b45c236 --- /dev/null +++ b/doc/js_tutorials/js_dnn/js_object_detection/js_object_detection.markdown @@ -0,0 +1,13 @@ +Object Detection Example {#tutorial_js_object_detection} +======================================= + +Goal +---- + +- In this tutorial you will learn how to use OpenCV.js dnn module for object detection. + +\htmlonly + +\endhtmlonly \ No newline at end of file diff --git a/doc/js_tutorials/js_dnn/js_object_detection/js_object_detection_with_camera.markdown b/doc/js_tutorials/js_dnn/js_object_detection/js_object_detection_with_camera.markdown new file mode 100644 index 0000000000..e6e8f6f957 --- /dev/null +++ b/doc/js_tutorials/js_dnn/js_object_detection/js_object_detection_with_camera.markdown @@ -0,0 +1,13 @@ +Object Detection Example with Camera{#tutorial_js_object_detection_with_camera} +======================================= + +Goal +---- + +- In this tutorial you will learn how to use OpenCV.js dnn module for object detection with camera. 
+ +\htmlonly + +\endhtmlonly \ No newline at end of file diff --git a/doc/js_tutorials/js_dnn/js_pose_estimation/js_pose_estimation.markdown b/doc/js_tutorials/js_dnn/js_pose_estimation/js_pose_estimation.markdown new file mode 100644 index 0000000000..b090ff2cfb --- /dev/null +++ b/doc/js_tutorials/js_dnn/js_pose_estimation/js_pose_estimation.markdown @@ -0,0 +1,13 @@ +Pose Estimation Example {#tutorial_js_pose_estimation} +======================================= + +Goal +---- + +- In this tutorial you will learn how to use OpenCV.js dnn module for pose estimation. + +\htmlonly + +\endhtmlonly \ No newline at end of file diff --git a/doc/js_tutorials/js_dnn/js_semantic_segmentation/js_semantic_segmentation.markdown b/doc/js_tutorials/js_dnn/js_semantic_segmentation/js_semantic_segmentation.markdown new file mode 100644 index 0000000000..50177fb549 --- /dev/null +++ b/doc/js_tutorials/js_dnn/js_semantic_segmentation/js_semantic_segmentation.markdown @@ -0,0 +1,13 @@ +Semantic Segmentation Example {#tutorial_js_semantic_segmentation} +======================================= + +Goal +---- + +- In this tutorial you will learn how to use OpenCV.js dnn module for semantic segmentation. + +\htmlonly + +\endhtmlonly \ No newline at end of file diff --git a/doc/js_tutorials/js_dnn/js_style_transfer/js_style_transfer.markdown b/doc/js_tutorials/js_dnn/js_style_transfer/js_style_transfer.markdown new file mode 100644 index 0000000000..7c1799ac6a --- /dev/null +++ b/doc/js_tutorials/js_dnn/js_style_transfer/js_style_transfer.markdown @@ -0,0 +1,13 @@ +Style Transfer Example {#tutorial_js_style_transfer} +======================================= + +Goal +---- + +- In this tutorial you will learn how to use OpenCV.js dnn module for style transfer. + +\htmlonly + +\endhtmlonly \ No newline at end of file diff --git a/doc/js_tutorials/js_dnn/js_table_of_contents_dnn.markdown b/doc/js_tutorials/js_dnn/js_table_of_contents_dnn.markdown new file mode 100644 index 0000000000..e008dc81d1 --- /dev/null +++ b/doc/js_tutorials/js_dnn/js_table_of_contents_dnn.markdown @@ -0,0 +1,30 @@ +Deep Neural Networks (dnn module) {#tutorial_js_table_of_contents_dnn} +============ + +- @subpage tutorial_js_image_classification + + Image classification example + +- @subpage tutorial_js_image_classification_with_camera + + Image classification example with camera + +- @subpage tutorial_js_object_detection + + Object detection example + +- @subpage tutorial_js_object_detection_with_camera + + Object detection example with camera + +- @subpage tutorial_js_semantic_segmentation + + Semantic segmentation example + +- @subpage tutorial_js_style_transfer + + Style transfer example + +- @subpage tutorial_js_pose_estimation + + Pose estimation example diff --git a/doc/js_tutorials/js_imgproc/js_intelligent_scissors/js_intelligent_scissors.markdown b/doc/js_tutorials/js_imgproc/js_intelligent_scissors/js_intelligent_scissors.markdown new file mode 100644 index 0000000000..1a3ca4c484 --- /dev/null +++ b/doc/js_tutorials/js_imgproc/js_intelligent_scissors/js_intelligent_scissors.markdown @@ -0,0 +1,16 @@ +Intelligent Scissors Demo {#tutorial_js_intelligent_scissors} +========================= + +Goal +---- + +- Here you can check how to use IntelligentScissors tool for image segmentation task. 
+- Available methods and parameters: @ref cv::segmentation::IntelligentScissorsMB + +@note The feature is integrated into [CVAT](https://github.com/openvinotoolkit/cvat) annotation tool and you can try it online on https://cvat.org + +\htmlonly + +\endhtmlonly diff --git a/doc/js_tutorials/js_imgproc/js_table_of_contents_imgproc.markdown b/doc/js_tutorials/js_imgproc/js_table_of_contents_imgproc.markdown index 3bb809be71..b06eb95639 100644 --- a/doc/js_tutorials/js_imgproc/js_table_of_contents_imgproc.markdown +++ b/doc/js_tutorials/js_imgproc/js_table_of_contents_imgproc.markdown @@ -77,3 +77,7 @@ Image Processing {#tutorial_js_table_of_contents_imgproc} - @subpage tutorial_js_imgproc_camera Learn image processing for video capture. + +- @subpage tutorial_js_intelligent_scissors + + Learn how to use IntelligentScissors tool for image segmentation task. diff --git a/doc/js_tutorials/js_setup/js_intro/js_intro.markdown b/doc/js_tutorials/js_setup/js_intro/js_intro.markdown index 416aa3ded5..01a123c5f4 100644 --- a/doc/js_tutorials/js_setup/js_intro/js_intro.markdown +++ b/doc/js_tutorials/js_setup/js_intro/js_intro.markdown @@ -13,7 +13,7 @@ OpenCV.js: OpenCV for the JavaScript programmer Web is the most ubiquitous open computing platform. With HTML5 standards implemented in every browser, web applications are able to render online video with HTML5 video tags, capture webcam video via WebRTC API, and access each pixel of a video frame via canvas API. With abundance of available multimedia content, web developers are in need of a wide array of image and vision processing algorithms in JavaScript to build innovative applications. This requirement is even more essential for emerging applications on the web, such as Web Virtual Reality (WebVR) and Augmented Reality (WebAR). All of these use cases demand efficient implementations of computation-intensive vision kernels on web. -[Emscripten](http://kripken.github.io/emscripten-site) is an LLVM-to-JavaScript compiler. It takes LLVM bitcode - which can be generated from C/C++ using clang, and compiles that into asm.js or WebAssembly that can execute directly inside the web browsers. . Asm.js is a highly optimizable, low-level subset of JavaScript. Asm.js enables ahead-of-time compilation and optimization in JavaScript engine that provide near-to-native execution speed. WebAssembly is a new portable, size- and load-time-efficient binary format suitable for compilation to the web. WebAssembly aims to execute at native speed. WebAssembly is currently being designed as an open standard by W3C. +[Emscripten](https://emscripten.org/) is an LLVM-to-JavaScript compiler. It takes LLVM bitcode - which can be generated from C/C++ using clang, and compiles that into asm.js or WebAssembly that can execute directly inside the web browsers. . Asm.js is a highly optimizable, low-level subset of JavaScript. Asm.js enables ahead-of-time compilation and optimization in JavaScript engine that provide near-to-native execution speed. WebAssembly is a new portable, size- and load-time-efficient binary format suitable for compilation to the web. WebAssembly aims to execute at native speed. WebAssembly is currently being designed as an open standard by W3C. OpenCV.js is a JavaScript binding for selected subset of OpenCV functions for the web platform. It allows emerging web applications with multimedia processing to benefit from the wide variety of vision functions available in OpenCV. 
OpenCV.js leverages Emscripten to compile OpenCV functions into asm.js or WebAssembly targets, and provides a JavaScript APIs for web application to access them. The future versions of the library will take advantage of acceleration APIs that are available on the Web such as SIMD and multi-threaded execution. @@ -42,4 +42,4 @@ Below is the list of contributors of OpenCV.js bindings and tutorials. - Gang Song (GSoC student, Shanghai Jiao Tong University) - Wenyao Gan (Student intern, Shanghai Jiao Tong University) - Mohammad Reza Haghighat (Project initiator & sponsor, Intel Corporation) -- Ningxin Hu (Students' supervisor, Intel Corporation) \ No newline at end of file +- Ningxin Hu (Students' supervisor, Intel Corporation) diff --git a/doc/js_tutorials/js_setup/js_setup/js_setup.markdown b/doc/js_tutorials/js_setup/js_setup/js_setup.markdown index 435f06fe02..ad14185a35 100644 --- a/doc/js_tutorials/js_setup/js_setup/js_setup.markdown +++ b/doc/js_tutorials/js_setup/js_setup/js_setup.markdown @@ -7,12 +7,12 @@ You don't have to build your own copy if you simply want to start using it. Refe Installing Emscripten ----------------------------- -[Emscripten](https://github.com/kripken/emscripten) is an LLVM-to-JavaScript compiler. We will use Emscripten to build OpenCV.js. +[Emscripten](https://github.com/emscripten-core/emscripten) is an LLVM-to-JavaScript compiler. We will use Emscripten to build OpenCV.js. @note While this describes installation of required tools from scratch, there's a section below also describing an alternative procedure to perform the same build using docker containers which is often easier. -To Install Emscripten, follow instructions of [Emscripten SDK](https://kripken.github.io/emscripten-site/docs/getting_started/downloads.html). +To Install Emscripten, follow instructions of [Emscripten SDK](https://emscripten.org/docs/getting_started/downloads.html). For example: @code{.bash} @@ -21,24 +21,29 @@ For example: ./emsdk activate latest @endcode -@note -To compile to [WebAssembly](http://webassembly.org), you need to install and activate [Binaryen](https://github.com/WebAssembly/binaryen) with the `emsdk` command. Please refer to [Developer's Guide](http://webassembly.org/getting-started/developers-guide/) for more details. -After install, ensure the `EMSCRIPTEN` environment is setup correctly. +After install, ensure the `EMSDK` environment is setup correctly. For example: @code{.bash} source ./emsdk_env.sh -echo ${EMSCRIPTEN} +echo ${EMSDK} @endcode -The version 1.39.16 of emscripten is verified for latest WebAssembly. Please check the version of emscripten to use the newest features of WebAssembly. +Modern versions of Emscripten requires to use `emcmake` / `emmake` launchers: + +@code{.bash} +emcmake sh -c 'echo ${EMSCRIPTEN}' +@endcode + + +The version 2.0.10 of emscripten is verified for latest WebAssembly. Please check the version of Emscripten to use the newest features of WebAssembly. 
For example: @code{.bash} ./emsdk update -./emsdk install 1.39.16 -./emsdk activate 1.39.16 +./emsdk install 2.0.10 +./emsdk activate 2.0.10 @endcode Obtaining OpenCV Source Code -------------------------------------- @@ -71,8 +76,7 @@ Building OpenCV.js from Source For example, to build in `build_js` directory: @code{.bash} - cd opencv - python ./platforms/js/build_js.py build_js + emcmake python ./opencv/platforms/js/build_js.py build_js @endcode @note @@ -82,14 +86,14 @@ Building OpenCV.js from Source For example, to build wasm version in `build_wasm` directory: @code{.bash} - python ./platforms/js/build_js.py build_wasm --build_wasm + emcmake python ./opencv/platforms/js/build_js.py build_wasm --build_wasm @endcode -# [Optional] To build the OpenCV.js loader, append `--build_loader`. For example: @code{.bash} - python ./platforms/js/build_js.py build_js --build_loader + emcmake python ./opencv/platforms/js/build_js.py build_js --build_loader @endcode @note @@ -114,7 +118,7 @@ Building OpenCV.js from Source For example: @code{.bash} - python ./platforms/js/build_js.py build_js --build_doc + emcmake python ./opencv/platforms/js/build_js.py build_js --build_doc @endcode @note @@ -124,9 +128,17 @@ Building OpenCV.js from Source For example: @code{.bash} - python ./platforms/js/build_js.py build_js --build_test + emcmake python ./opencv/platforms/js/build_js.py build_js --build_test @endcode +-# [optional] To enable OpenCV contrib modules append `--cmake_option="-DOPENCV_EXTRA_MODULES_PATH=/path/to/opencv_contrib/modules/"` + + For example: + @code{.bash} + python ./platforms/js/build_js.py build_js --cmake_option="-DOPENCV_EXTRA_MODULES_PATH=opencv_contrib/modules" + @endcode + + Running OpenCV.js Tests --------------------------------------- @@ -186,7 +205,7 @@ node tests.js For example: @code{.bash} - python ./platforms/js/build_js.py build_js --build_wasm --threads + emcmake python ./opencv/platforms/js/build_js.py build_js --build_wasm --threads @endcode The default number of threads is the number of logical cores on your device. You can use `cv.parallel_pthreads_set_threads_num(number)` to set the number of threads yourself and use `cv.parallel_pthreads_get_threads_num()` to get the current number of threads. @@ -198,7 +217,7 @@ node tests.js For example: @code{.bash} - python ./platforms/js/build_js.py build_js --build_wasm --simd + emcmake python ./opencv/platforms/js/build_js.py build_js --build_wasm --simd @endcode The simd optimization is experimental as wasm simd is still in development. @@ -222,7 +241,7 @@ node tests.js For example: @code{.bash} - python ./platforms/js/build_js.py build_js --build_wasm --simd --build_wasm_intrin_test + emcmake python ./opencv/platforms/js/build_js.py build_js --build_wasm --simd --build_wasm_intrin_test @endcode For wasm intrinsics tests, you can use the following function to test all the cases: @@ -250,7 +269,7 @@ node tests.js For example: @code{.bash} - python ./platforms/js/build_js.py build_js --build_perf + emcmake python ./opencv/platforms/js/build_js.py build_js --build_perf @endcode To run performance tests, launch a local web server in the \/bin folder. For example, node http-server, which serves on `localhost:8080` (a Python-based alternative is sketched below).
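If Node.js is not available, any static web server can serve the test and performance pages. A minimal sketch using Python's built-in `http.server` module, where the `build_js/bin` directory and port 8080 are placeholders for your own setup:

@code{.py}
# Serve the OpenCV.js build output so the browser test/performance pages load.
# 'build_js/bin' and port 8080 are assumptions, not fixed OpenCV locations.
import functools
from http.server import HTTPServer, SimpleHTTPRequestHandler

handler = functools.partial(SimpleHTTPRequestHandler, directory="build_js/bin")
print("Serving on http://localhost:8080")
HTTPServer(("localhost", 8080), handler).serve_forever()
@endcode

Then point the browser at the served test page (e.g. `tests.html` in the build output directory).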
@@ -271,25 +290,31 @@ Building OpenCV.js with Docker Alternatively, the same build can be accomplished using [docker](https://www.docker.com/) containers, which is often easier and more reliable, particularly on non-Linux systems. You only need to install [docker](https://www.docker.com/) on your system and use a popular container that provides a clean, well-tested environment for emscripten builds like this, which already has the latest versions of all the necessary tools installed. -So, make sure [docker](https://www.docker.com/) is installed in your system and running. The following shell script should work in linux and MacOS: +So, make sure [docker](https://www.docker.com/) is installed on your system and running. The following shell script should work on Linux and macOS: @code{.bash} git clone https://github.com/opencv/opencv.git cd opencv -docker run --rm --workdir /code -v "$PWD":/code "trzeci/emscripten:latest" python ./platforms/js/build_js.py build +docker run --rm -v $(pwd):/src -u $(id -u):$(id -g) emscripten/emsdk emcmake python3 ./platforms/js/build_js.py build_js @endcode In Windows use the following PowerShell command: @code{.bash} -docker run --rm --workdir /code -v "$(get-location):/code" "trzeci/emscripten:latest" python ./platforms/js/build_js.py build +docker run --rm --workdir /src -v "$(get-location):/src" "emscripten/emsdk" emcmake python3 ./platforms/js/build_js.py build_js @endcode @warning -The example uses latest version of emscripten. If the build fails you should try a version that is known to work fine which is `1.38.32` using the following command: +The example uses the latest version of Emscripten. If the build fails, you should try a version that is known to work fine, `2.0.10`, using the following command: @code{.bash} -docker run --rm --workdir /code -v "$PWD":/code "trzeci/emscripten:sdk-tag-1.38.32-64bit" python ./platforms/js/build_js.py build +docker run --rm -v $(pwd):/src -u $(id -u):$(id -g) emscripten/emsdk:2.0.10 emcmake python3 ./platforms/js/build_js.py build_js +@endcode + +In Windows use the following PowerShell command: + +@code{.bash} +docker run --rm --workdir /src -v "$(get-location):/src" "emscripten/emsdk:2.0.10" emcmake python3 ./platforms/js/build_js.py build_js @endcode ### Building the documentation with Docker @@ -297,10 +322,11 @@ docker run --rm --workdir /code -v "$PWD":/code "trzeci/emscripten:sdk-tag-1.38. To build the documentation `doxygen` needs to be installed. Create a file named `Dockerfile` with the following content: ``` -FROM trzeci/emscripten:sdk-tag-1.38.32-64bit +FROM emscripten/emsdk:2.0.10 -RUN apt-get update -y -RUN apt-get install -y doxygen +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends doxygen \ + && rm -rf /var/lib/apt/lists/* ``` Then we build the docker image and name it `opencv-js-doc` with the following command (that needs to be run only once): @@ -312,5 +338,5 @@ docker build . 
-t opencv-js-doc Now run the build command again, this time using the new image and passing `--build_doc`: @code{.bash} -docker run --rm --workdir /code -v "$PWD":/code "opencv-js-doc" python ./platforms/js/build_js.py build --build_doc +docker run --rm -v $(pwd):/src -u $(id -u):$(id -g) "opencv-js-doc" emcmake python3 ./platforms/js/build_js.py build_js --build_doc @endcode diff --git a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown index 5f9f338f2d..5a8c3b87fa 100644 --- a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown +++ b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown @@ -4,7 +4,7 @@ Using OpenCV.js {#tutorial_js_usage} Steps ----- -In this tutorial, you will learn how to include and start to use `opencv.js` inside a web page. You can get a copy of `opencv.js` from `opencv-{VERSION_NUMBER}-docs.zip` in each [release](https://github.com/opencv/opencv/releases), or simply download the prebuilt script from the online documentations at "https://docs.opencv.org/{VERISON_NUMBER}/opencv.js" (For example, [https://docs.opencv.org/3.4.0/opencv.js](https://docs.opencv.org/3.4.0/opencv.js). Use `master` if you want the latest build). You can also build your own copy by following the tutorial on Build Opencv.js. +In this tutorial, you will learn how to include and start using `opencv.js` inside a web page. You can get a copy of `opencv.js` from `opencv-{VERSION_NUMBER}-docs.zip` in each [release](https://github.com/opencv/opencv/releases), or simply download the prebuilt script from the online documentation at "https://docs.opencv.org/{VERSION_NUMBER}/opencv.js" (For example, [https://docs.opencv.org/3.4.0/opencv.js](https://docs.opencv.org/3.4.0/opencv.js). Use `master` if you want the latest build). You can also build your own copy by following the tutorial on Build OpenCV.js. ### Create a web page @@ -82,7 +82,7 @@ In this tutorial, we just show a cv.Mat on screen. To show a cv.Mat, you need a You can use cv.imshow to show cv.Mat on the canvas. @code{.js} -cv.imshow(mat, "outputCanvas"); +cv.imshow("outputCanvas", mat); @endcode Putting all of the steps together, the final index.html is shown below. @@ -129,7 +129,7 @@ function onOpenCvReady() { @endcode -@note You have to call delete method of cv.Mat to free memory allocated in Emscripten's heap. Please refer to [Memory management of Emscripten](https://kripken.github.io/emscripten-site/docs/porting/connecting_cpp_and_javascript/embind.html#memory-management) for details. +@note You have to call the delete method of cv.Mat to free memory allocated in Emscripten's heap. Please refer to [Memory management of Emscripten](https://emscripten.org/docs/porting/connecting_cpp_and_javascript/embind.html#memory-management) for details. Try it ------ @@ -137,4 +137,4 @@ Try it -\endhtmlonly \ No newline at end of file +\endhtmlonly diff --git a/doc/js_tutorials/js_tutorials.markdown b/doc/js_tutorials/js_tutorials.markdown index c8a8f92a31..73e69daa98 100644 --- a/doc/js_tutorials/js_tutorials.markdown +++ b/doc/js_tutorials/js_tutorials.markdown @@ -26,3 +26,7 @@ OpenCV.js Tutorials {#tutorial_js_root} In this section you will learn object detection techniques like face detection etc. 
+ +- @subpage tutorial_js_table_of_contents_dnn + + These tutorials show how to use dnn module in JavaScript diff --git a/doc/opencv.bib b/doc/opencv.bib index 54396d6a10..d44b0f5293 100644 --- a/doc/opencv.bib +++ b/doc/opencv.bib @@ -110,6 +110,29 @@ year = {2010}, url = {http://ingmec.ual.es/~jlblanco/papers/jlblanco2010geometry3D_techrep.pdf} } +@inproceedings{Bolelli2017, + title = {{Two More Strategies to Speed Up Connected Components Labeling Algorithms}}, + author = {Bolelli, Federico and Cancilla, Michele and Grana, Costantino}, + year = 2017, + booktitle = {Image Analysis and Processing - ICIAP 2017}, + publisher = {Springer}, + volume = 10485, + pages = {48--58}, + doi = {10.1007/978-3-319-68548-9_5}, + isbn = {978-3-319-68547-2} +} +@article{Bolelli2019, + title = {{Spaghetti Labeling: Directed Acyclic Graphs for Block-Based Connected Components Labeling}}, + author = {Bolelli, Federico and Allegretti, Stefano and Baraldi, Lorenzo and Grana, Costantino}, + year = 2019, + journal = {IEEE Transactions on Image Processing}, + publisher = {IEEE}, + volume = 29, + number = 1, + pages = {1999--2012}, + doi = {10.1109/TIP.2019.2946979}, + issn = {1057-7149} +} @article{Borgefors86, author = {Borgefors, Gunilla}, title = {Distance transformations in digital images}, @@ -420,6 +443,16 @@ volume = {51}, pages = {378-384} } +@article{Grana2010, + title = {{Optimized Block-Based Connected Components Labeling With Decision Trees}}, + author = {Grana, Costantino and Borghesani, Daniele and Cucchiara, Rita}, + year = 2010, + journal = {IEEE Transactions on Image Processing}, + volume = 19, + number = 6, + pages = {1596--1609}, + doi = {10.1109/TIP.2010.2044963} +} @article{taubin1991, abstract = {The author addresses the problem of parametric representation and estimation of complex planar curves in 2-D surfaces in 3-D, and nonplanar space curves in 3-D. Curves and surfaces can be defined either parametrically or implicitly, with the latter representation used here. A planar curve is the set of zeros of a smooth function of two variables x-y, a surface is the set of zeros of a smooth function of three variables x-y-z, and a space curve is the intersection of two surfaces, which are the set of zeros of two linearly independent smooth functions of three variables x-y-z For example, the surface of a complex object in 3-D can be represented as a subset of a single implicit surface, with similar results for planar and space curves. It is shown how this unified representation can be used for object recognition, object position estimation, and segmentation of objects into meaningful subobjects, that is, the detection of `interest regions' that are more complex than high curvature regions and, hence, more useful as features for object recognition}, author = {Taubin, Gabriel}, @@ -768,6 +801,13 @@ pages = {432--441}, publisher = {Springer} } +@INPROCEEDINGS{Mortensen95intelligentscissors, + author = {Eric N. Mortensen and William A. 
Barrett}, + title = {Intelligent Scissors for Image Composition}, + booktitle = {Computer Graphics (SIGGRAPH Proceedings)}, + year = {1995}, + pages = {191--198} +} @inproceedings{Muja2009, author = {Muja, Marius and Lowe, David G}, title = {Fast Approximate Nearest Neighbors with Automatic Algorithm Configuration}, @@ -1261,3 +1301,26 @@ pages={281--305}, year={1987} } +@inproceedings{liao2020real, + author={Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang}, + title={Real-time Scene Text Detection with Differentiable Binarization}, + booktitle={Proc. AAAI}, + year={2020} +} +@article{shi2016end, + title={An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition}, + author={Shi, Baoguang and Bai, Xiang and Yao, Cong}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + volume={39}, + number={11}, + pages={2298--2304}, + year={2016}, + publisher={IEEE} +} +@inproceedings{zhou2017east, + title={EAST: An Efficient and Accurate Scene Text Detector}, + author={Zhou, Xinyu and Yao, Cong and Wen, He and Wang, Yuzhi and Zhou, Shuchang and He, Weiran and Liang, Jiajun}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={5551--5560}, + year={2017} +} diff --git a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown index 656f5423c5..dee4df774a 100644 --- a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown +++ b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown @@ -20,10 +20,10 @@ scale invariant. ![image](images/sift_scale_invariant.jpg) -So, in 2004, **D.Lowe**, University of British Columbia, came up with a new algorithm, Scale +In 2004, **D.Lowe**, University of British Columbia, came up with a new algorithm, Scale Invariant Feature Transform (SIFT) in his paper, **Distinctive Image Features from Scale-Invariant Keypoints**, which extracts keypoints and computes their descriptors. *(This paper is easy to understand -and considered to be best material available on SIFT. So this explanation is just a short summary of +and considered to be the best material available on SIFT. This explanation is just a short summary of this paper)*. There are mainly four steps involved in the SIFT algorithm. We will see them one-by-one. @@ -102,16 +102,17 @@ reasons. In that case, ratio of closest-distance to second-closest distance is t greater than 0.8, they are rejected. It eliminates around 90% of false matches while discarding only 5% of correct matches, as per the paper. -So this is a summary of SIFT algorithm. For more details and understanding, reading the original -paper is highly recommended. Remember one thing, this algorithm is patented. So this algorithm is -included in [the opencv contrib repo](https://github.com/opencv/opencv_contrib) +This is a summary of the SIFT algorithm. For more details and a deeper understanding, reading the original +paper is highly recommended. SIFT in OpenCV -------------- -So now let's see SIFT functionalities available in OpenCV. Let's start with keypoint detection and -draw them. First we have to construct a SIFT object. We can pass different parameters to it which -are optional and they are well explained in docs. +Now let's see SIFT functionalities available in OpenCV. 
Note that these were previously only +available in [the opencv contrib repo](https://github.com/opencv/opencv_contrib), but the patent +expired in the year 2020. So they are now included in the main repo. Let's start with keypoint +detection and draw them. First we have to construct a SIFT object. We can pass different +parameters to it which are optional and they are well explained in docs. @code{.py} import numpy as np import cv2 as cv diff --git a/doc/py_tutorials/py_imgproc/py_pyramids/py_pyramids.markdown b/doc/py_tutorials/py_imgproc/py_pyramids/py_pyramids.markdown index bb31bab107..63fde0a130 100644 --- a/doc/py_tutorials/py_imgproc/py_pyramids/py_pyramids.markdown +++ b/doc/py_tutorials/py_imgproc/py_pyramids/py_pyramids.markdown @@ -88,27 +88,27 @@ B = cv.imread('orange.jpg') # generate Gaussian pyramid for A G = A.copy() gpA = [G] -for i in xrange(6): +for i in range(6): G = cv.pyrDown(G) gpA.append(G) # generate Gaussian pyramid for B G = B.copy() gpB = [G] -for i in xrange(6): +for i in range(6): G = cv.pyrDown(G) gpB.append(G) # generate Laplacian Pyramid for A lpA = [gpA[5]] -for i in xrange(5,0,-1): +for i in range(5,0,-1): GE = cv.pyrUp(gpA[i]) L = cv.subtract(gpA[i-1],GE) lpA.append(L) # generate Laplacian Pyramid for B lpB = [gpB[5]] -for i in xrange(5,0,-1): +for i in range(5,0,-1): GE = cv.pyrUp(gpB[i]) L = cv.subtract(gpB[i-1],GE) lpB.append(L) @@ -122,7 +122,7 @@ for la,lb in zip(lpA,lpB): # now reconstruct ls_ = LS[0] -for i in xrange(1,6): +for i in range(1,6): ls_ = cv.pyrUp(ls_) ls_ = cv.add(ls_, LS[i]) diff --git a/doc/py_tutorials/py_imgproc/py_thresholding/py_thresholding.markdown b/doc/py_tutorials/py_imgproc/py_thresholding/py_thresholding.markdown index 0540098850..f52e9c5db6 100644 --- a/doc/py_tutorials/py_imgproc/py_thresholding/py_thresholding.markdown +++ b/doc/py_tutorials/py_imgproc/py_thresholding/py_thresholding.markdown @@ -47,7 +47,7 @@ ret,thresh5 = cv.threshold(img,127,255,cv.THRESH_TOZERO_INV) titles = ['Original Image','BINARY','BINARY_INV','TRUNC','TOZERO','TOZERO_INV'] images = [img, thresh1, thresh2, thresh3, thresh4, thresh5] -for i in xrange(6): +for i in range(6): plt.subplot(2,3,i+1),plt.imshow(images[i],'gray',vmin=0,vmax=255) plt.title(titles[i]) plt.xticks([]),plt.yticks([]) @@ -98,7 +98,7 @@ titles = ['Original Image', 'Global Thresholding (v = 127)', 'Adaptive Mean Thresholding', 'Adaptive Gaussian Thresholding'] images = [img, th1, th2, th3] -for i in xrange(4): +for i in range(4): plt.subplot(2,2,i+1),plt.imshow(images[i],'gray') plt.title(titles[i]) plt.xticks([]),plt.yticks([]) @@ -153,7 +153,7 @@ titles = ['Original Noisy Image','Histogram','Global Thresholding (v=127)', 'Original Noisy Image','Histogram',"Otsu's Thresholding", 'Gaussian filtered Image','Histogram',"Otsu's Thresholding"] -for i in xrange(3): +for i in range(3): plt.subplot(3,3,i*3+1),plt.imshow(images[i*3],'gray') plt.title(titles[i*3]), plt.xticks([]), plt.yticks([]) plt.subplot(3,3,i*3+2),plt.hist(images[i*3].ravel(),256) @@ -196,7 +196,7 @@ bins = np.arange(256) fn_min = np.inf thresh = -1 -for i in xrange(1,256): +for i in range(1,256): p1,p2 = np.hsplit(hist_norm,[i]) # probabilities q1,q2 = Q[i],Q[255]-Q[i] # cum sum of classes if q1 < 1.e-6 or q2 < 1.e-6: diff --git a/doc/py_tutorials/py_imgproc/py_transforms/py_fourier_transform/py_fourier_transform.markdown b/doc/py_tutorials/py_imgproc/py_transforms/py_fourier_transform/py_fourier_transform.markdown index 44b08d53ab..6c4533a1b0 100644 --- 
a/doc/py_tutorials/py_imgproc/py_transforms/py_fourier_transform/py_fourier_transform.markdown +++ b/doc/py_tutorials/py_imgproc/py_transforms/py_fourier_transform/py_fourier_transform.markdown @@ -268,7 +268,7 @@ fft_filters = [np.fft.fft2(x) for x in filters] fft_shift = [np.fft.fftshift(y) for y in fft_filters] mag_spectrum = [np.log(np.abs(z)+1) for z in fft_shift] -for i in xrange(6): +for i in range(6): plt.subplot(2,3,i+1),plt.imshow(mag_spectrum[i],cmap = 'gray') plt.title(filter_name[i]), plt.xticks([]), plt.yticks([]) diff --git a/doc/py_tutorials/py_photo/py_non_local_means/py_non_local_means.markdown b/doc/py_tutorials/py_photo/py_non_local_means/py_non_local_means.markdown index 3f56a4841b..94e57d4d6e 100644 --- a/doc/py_tutorials/py_photo/py_non_local_means/py_non_local_means.markdown +++ b/doc/py_tutorials/py_photo/py_non_local_means/py_non_local_means.markdown @@ -108,7 +108,7 @@ from matplotlib import pyplot as plt cap = cv.VideoCapture('vtest.avi') # create a list of first 5 frames -img = [cap.read()[1] for i in xrange(5)] +img = [cap.read()[1] for i in range(5)] # convert all to grayscale gray = [cv.cvtColor(i, cv.COLOR_BGR2GRAY) for i in img] diff --git a/doc/py_tutorials/py_setup/py_intro/py_intro.markdown b/doc/py_tutorials/py_setup/py_intro/py_intro.markdown index 487ba72ee7..b013ef014e 100644 --- a/doc/py_tutorials/py_setup/py_intro/py_intro.markdown +++ b/doc/py_tutorials/py_setup/py_intro/py_intro.markdown @@ -83,4 +83,4 @@ Additional Resources 2. [NumPy Quickstart tutorial](https://numpy.org/devdocs/user/quickstart.html) 3. [NumPy Reference](https://numpy.org/devdocs/reference/index.html#reference) 4. [OpenCV Documentation](http://docs.opencv.org/) -5. [OpenCV Forum](http://answers.opencv.org/questions/) +5. [OpenCV Forum](https://forum.opencv.org/) diff --git a/doc/py_tutorials/py_setup/py_setup_in_ubuntu/py_setup_in_ubuntu.markdown b/doc/py_tutorials/py_setup/py_setup_in_ubuntu/py_setup_in_ubuntu.markdown index f88ffe6793..8b99c5df92 100644 --- a/doc/py_tutorials/py_setup/py_setup_in_ubuntu/py_setup_in_ubuntu.markdown +++ b/doc/py_tutorials/py_setup/py_setup_in_ubuntu/py_setup_in_ubuntu.markdown @@ -22,10 +22,10 @@ Installing OpenCV-Python from Pre-built Binaries This method serves best when using just for programming and developing OpenCV applications. -Install package [python-opencv](https://packages.ubuntu.com/trusty/python-opencv) with following command in terminal (as root user). +Install package [python3-opencv](https://packages.ubuntu.com/focal/python3-opencv) with following command in terminal (as root user). ``` -$ sudo apt-get install python-opencv +$ sudo apt-get install python3-opencv ``` Open Python IDLE (or IPython) and type following codes in Python terminal. diff --git a/doc/tools/scan_tutorials.py b/doc/tools/scan_tutorials.py new file mode 100644 index 0000000000..0b924a4626 --- /dev/null +++ b/doc/tools/scan_tutorials.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +from pathlib import Path +import re + +# Tasks +# 1. Find all tutorials +# 2. Generate tree (@subpage) +# 3. 
Check prev/next nodes + +class Tutorial(object): + def __init__(self, path): + self.path = path + self.title = None # doxygen title + self.children = [] # ordered titles + self.prev = None + self.next = None + with open(path, "rt") as f: + self.parse(f) + + def parse(self, f): + rx_title = re.compile(r"\{#(\w+)\}") + rx_subpage = re.compile(r"@subpage\s+(\w+)") + rx_prev = re.compile(r"@prev_tutorial\{(\w+)\}") + rx_next = re.compile(r"@next_tutorial\{(\w+)\}") + for line in f: + if self.title is None: + m = rx_title.search(line) + if m: + self.title = m.group(1) + continue + if self.prev is None: + m = rx_prev.search(line) + if m: + self.prev = m.group(1) + continue + if self.next is None: + m = rx_next.search(line) + if m: + self.next = m.group(1) + continue + m = rx_subpage.search(line) + if m: + self.children.append(m.group(1)) + continue + + def verify_prev_next(self, storage): + res = True + + if self.title is None: + print("[W] No title") + res = False + + prev = None + for one in self.children: + c = storage[one] + if c.prev is not None and c.prev != prev: + print("[W] Wrong prev_tutorial: expected {} / actual {}".format(c.prev, prev)) + res = False + prev = c.title + + next = None + for one in reversed(self.children): + c = storage[one] + if c.next is not None and c.next != next: + print("[W] Wrong next_tutorial: expected {} / actual {}".format(c.next, next)) + res = False + next = c.title + + if len(self.children) == 0 and self.prev is None and self.next is None: + print("[W] No prev and next tutorials") + res = False + + return res + +if __name__ == "__main__": + + p = Path('tutorials') + print("Looking for tutorials in: '{}'".format(p)) + + all_tutorials = dict() + for f in p.glob('**/*'): + if f.suffix.lower() in ('.markdown', '.md'): + t = Tutorial(f) + all_tutorials[t.title] = t + + res = 0 + print("Found: {}".format(len(all_tutorials))) + print("------") + for title, t in all_tutorials.items(): + if not t.verify_prev_next(all_tutorials): + print("[E] Verification failed: {}".format(t.path)) + print("------") + res = 1 + + exit(res) diff --git a/doc/tutorials/app/_old/table_of_content_highgui.markdown b/doc/tutorials/app/_old/table_of_content_highgui.markdown new file mode 100644 index 0000000000..3a1705ecd5 --- /dev/null +++ b/doc/tutorials/app/_old/table_of_content_highgui.markdown @@ -0,0 +1,4 @@ +High Level GUI and Media (highgui module) {#tutorial_table_of_content_highgui} +========================================= + +Content has been moved to this page: @ref tutorial_table_of_content_app diff --git a/doc/tutorials/app/_old/table_of_content_imgcodecs.markdown b/doc/tutorials/app/_old/table_of_content_imgcodecs.markdown new file mode 100644 index 0000000000..a49bbe5cce --- /dev/null +++ b/doc/tutorials/app/_old/table_of_content_imgcodecs.markdown @@ -0,0 +1,4 @@ +Image Input and Output (imgcodecs module) {#tutorial_table_of_content_imgcodecs} +========================================= + +Content has been moved to this page: @ref tutorial_table_of_content_app diff --git a/doc/tutorials/app/_old/table_of_content_videoio.markdown b/doc/tutorials/app/_old/table_of_content_videoio.markdown new file mode 100644 index 0000000000..f2b3ccf81c --- /dev/null +++ b/doc/tutorials/app/_old/table_of_content_videoio.markdown @@ -0,0 +1,4 @@ +Video Input and Output (videoio module) {#tutorial_table_of_content_videoio} +========================================= + +Content has been moved to this page: @ref tutorial_table_of_content_app diff --git 
a/doc/tutorials/highgui/trackbar/images/Adding_Trackbars_Tutorial_Result_0.jpg b/doc/tutorials/app/images/Adding_Trackbars_Tutorial_Result_0.jpg similarity index 100% rename from doc/tutorials/highgui/trackbar/images/Adding_Trackbars_Tutorial_Result_0.jpg rename to doc/tutorials/app/images/Adding_Trackbars_Tutorial_Result_0.jpg diff --git a/doc/tutorials/highgui/trackbar/images/Adding_Trackbars_Tutorial_Result_1.jpg b/doc/tutorials/app/images/Adding_Trackbars_Tutorial_Result_1.jpg similarity index 100% rename from doc/tutorials/highgui/trackbar/images/Adding_Trackbars_Tutorial_Result_1.jpg rename to doc/tutorials/app/images/Adding_Trackbars_Tutorial_Result_1.jpg diff --git a/doc/tutorials/highgui/trackbar/images/Adding_Trackbars_Tutorial_Trackbar.png b/doc/tutorials/app/images/Adding_Trackbars_Tutorial_Trackbar.png similarity index 100% rename from doc/tutorials/highgui/trackbar/images/Adding_Trackbars_Tutorial_Trackbar.png rename to doc/tutorials/app/images/Adding_Trackbars_Tutorial_Trackbar.png diff --git a/doc/tutorials/videoio/orbbec-astra/images/astra_color.jpg b/doc/tutorials/app/images/astra_color.jpg similarity index 100% rename from doc/tutorials/videoio/orbbec-astra/images/astra_color.jpg rename to doc/tutorials/app/images/astra_color.jpg diff --git a/doc/tutorials/videoio/orbbec-astra/images/astra_depth.png b/doc/tutorials/app/images/astra_depth.png similarity index 100% rename from doc/tutorials/videoio/orbbec-astra/images/astra_depth.png rename to doc/tutorials/app/images/astra_depth.png diff --git a/doc/tutorials/imgcodecs/raster-gdal/images/gdal_flood-zone.jpg b/doc/tutorials/app/images/gdal_flood-zone.jpg similarity index 100% rename from doc/tutorials/imgcodecs/raster-gdal/images/gdal_flood-zone.jpg rename to doc/tutorials/app/images/gdal_flood-zone.jpg diff --git a/doc/tutorials/imgcodecs/raster-gdal/images/gdal_heat-map.jpg b/doc/tutorials/app/images/gdal_heat-map.jpg similarity index 100% rename from doc/tutorials/imgcodecs/raster-gdal/images/gdal_heat-map.jpg rename to doc/tutorials/app/images/gdal_heat-map.jpg diff --git a/doc/tutorials/imgcodecs/raster-gdal/images/gdal_output.jpg b/doc/tutorials/app/images/gdal_output.jpg similarity index 100% rename from doc/tutorials/imgcodecs/raster-gdal/images/gdal_output.jpg rename to doc/tutorials/app/images/gdal_output.jpg diff --git a/doc/tutorials/videoio/video-input-psnr-ssim/images/outputVideoInput.png b/doc/tutorials/app/images/outputVideoInput.png similarity index 100% rename from doc/tutorials/videoio/video-input-psnr-ssim/images/outputVideoInput.png rename to doc/tutorials/app/images/outputVideoInput.png diff --git a/doc/tutorials/videoio/video-write/images/resultOutputWideoWrite.png b/doc/tutorials/app/images/resultOutputWideoWrite.png similarity index 100% rename from doc/tutorials/videoio/video-write/images/resultOutputWideoWrite.png rename to doc/tutorials/app/images/resultOutputWideoWrite.png diff --git a/doc/tutorials/videoio/video-write/images/videoCompressSelect.png b/doc/tutorials/app/images/videoCompressSelect.png similarity index 100% rename from doc/tutorials/videoio/video-write/images/videoCompressSelect.png rename to doc/tutorials/app/images/videoCompressSelect.png diff --git a/doc/tutorials/videoio/video-write/images/videoFileStructure.png b/doc/tutorials/app/images/videoFileStructure.png similarity index 100% rename from doc/tutorials/videoio/video-write/images/videoFileStructure.png rename to doc/tutorials/app/images/videoFileStructure.png diff --git a/doc/tutorials/videoio/intelperc.markdown 
b/doc/tutorials/app/intelperc.markdown similarity index 99% rename from doc/tutorials/videoio/intelperc.markdown rename to doc/tutorials/app/intelperc.markdown index 6a6a5e5c9a..5c036a63c2 100644 --- a/doc/tutorials/videoio/intelperc.markdown +++ b/doc/tutorials/app/intelperc.markdown @@ -1,6 +1,8 @@ Using Creative Senz3D and other Intel RealSense SDK compatible depth sensors {#tutorial_intelperc} ======================================================================================= +@tableofcontents + @prev_tutorial{tutorial_orbbec_astra} **Note**: This tutorial is partially obsolete since PerC SDK has been replaced with RealSense SDK diff --git a/doc/tutorials/videoio/kinect_openni.markdown b/doc/tutorials/app/kinect_openni.markdown similarity index 99% rename from doc/tutorials/videoio/kinect_openni.markdown rename to doc/tutorials/app/kinect_openni.markdown index aadaec5e44..e235a97755 100644 --- a/doc/tutorials/videoio/kinect_openni.markdown +++ b/doc/tutorials/app/kinect_openni.markdown @@ -1,6 +1,8 @@ Using Kinect and other OpenNI compatible depth sensors {#tutorial_kinect_openni} ====================================================== +@tableofcontents + @prev_tutorial{tutorial_video_write} @next_tutorial{tutorial_orbbec_astra} diff --git a/doc/tutorials/videoio/orbbec-astra/orbbec_astra.markdown b/doc/tutorials/app/orbbec_astra.markdown similarity index 64% rename from doc/tutorials/videoio/orbbec-astra/orbbec_astra.markdown rename to doc/tutorials/app/orbbec_astra.markdown index 664e4f6dfe..273c3c3536 100644 --- a/doc/tutorials/videoio/orbbec-astra/orbbec_astra.markdown +++ b/doc/tutorials/app/orbbec_astra.markdown @@ -1,6 +1,8 @@ Using Orbbec Astra 3D cameras {#tutorial_orbbec_astra} ====================================================== +@tableofcontents + @prev_tutorial{tutorial_kinect_openni} @next_tutorial{tutorial_intelperc} @@ -9,12 +11,12 @@ Using Orbbec Astra 3D cameras {#tutorial_orbbec_astra} This tutorial is devoted to the Astra Series of Orbbec 3D cameras (https://orbbec3d.com/product-astra-pro/). These cameras have a depth sensor in addition to a common color sensor. The depth sensors can be read using -the OpenNI interface with @ref cv::VideoCapture class. The video stream is provided through the regular camera -interface. +the open source OpenNI API with @ref cv::VideoCapture class. The video stream is provided through the regular +camera interface. ### Installation Instructions -In order to use a depth sensor with OpenCV you should do the following steps: +In order to use the Astra camera's depth sensor with OpenCV, you should do the following steps: -# Download the latest version of Orbbec OpenNI SDK (from here ). Unzip the archive, choose the build according to your operating system and follow installation @@ -70,24 +72,32 @@ In order to use a depth sensor with OpenCV you should do the following steps: ### Code -To get both depth and color frames, two @ref cv::VideoCapture objects should be created: +The Astra Pro camera has two sensors -- a depth sensor and a color sensor. The depth sensor +can be read using the OpenNI interface with @ref cv::VideoCapture class. The video stream is +not available through OpenNI API and is only provided via the regular camera interface. +So, to get both depth and color frames, two @ref cv::VideoCapture objects should be created: @snippetlineno samples/cpp/tutorial_code/videoio/orbbec_astra/orbbec_astra.cpp Open streams -The first object will use the regular Video4Linux2 interface to access the color sensor. 
The second one -is using OpenNI2 API to retrieve depth data. +The first object will use the OpenNI2 API to retrieve depth data. The second one uses the +Video4Linux2 interface to access the color sensor. Note that the example above assumes that +the Astra camera is the first camera in the system. If you have more than one camera connected, +you may need to explicitly set the proper camera number. -Before using the created VideoCapture objects you may want to setup stream parameters by setting -objects' properties. The most important parameters are frame width, frame height and fps: +Before using the created VideoCapture objects you may want to set up stream parameters by setting +objects' properties. The most important parameters are frame width, frame height and fps. +For this example, we’ll configure width and height of both streams to VGA resolution, which is +the maximum resolution available for both sensors, and we’d like both stream parameters to be the +same for easier color-to-depth data registration: @snippetlineno samples/cpp/tutorial_code/videoio/orbbec_astra/orbbec_astra.cpp Setup streams -For setting and getting some property of sensor data generators use @ref cv::VideoCapture::set and +For setting and retrieving some property of sensor data generators use @ref cv::VideoCapture::set and @ref cv::VideoCapture::get methods respectively, e.g. : @snippetlineno samples/cpp/tutorial_code/videoio/orbbec_astra/orbbec_astra.cpp Get properties -The following properties of cameras available through OpenNI interfaces are supported for the depth +The following properties of cameras available through OpenNI interface are supported for the depth generator: - @ref cv::CAP_PROP_FRAME_WIDTH -- Frame width in pixels. @@ -106,15 +116,16 @@ generator: - @ref cv::CAP_PROP_OPENNI_FRAME_MAX_DEPTH -- A maximum supported depth of the camera in mm. - @ref cv::CAP_PROP_OPENNI_BASELINE -- Baseline value in mm. -After the VideoCapture objects are set up you can start reading frames from them. +After the VideoCapture objects have been set up, you can start reading frames from them. @note OpenCV's VideoCapture provides synchronous API, so you have to grab frames in a new thread to avoid one stream blocking while another stream is being read. VideoCapture is not a thread-safe class, so you need to be careful to avoid any possible deadlocks or data races. -Example implementation that gets frames from each sensor in a new thread and stores them -in a list along with their timestamps: +As there are two video sources that should be read simultaneously, it’s necessary to create two +threads to avoid blocking. Example implementation that gets frames from each sensor in a new thread +and stores them in a list along with their timestamps: @snippetlineno samples/cpp/tutorial_code/videoio/orbbec_astra/orbbec_astra.cpp Read streams @@ -130,17 +141,25 @@ VideoCapture can retrieve the following data: -# data given from the color sensor is a regular BGR image (CV_8UC3). -When new data is available a reading thread notifies the main thread. A frame is stored in the -ordered list -- the first frame is the latest one: +When new data are available, each reading thread notifies the main thread using a condition variable. +A frame is stored in the ordered list -- the first frame in the list is the earliest captured, +the last frame is the latest captured. As depth and color frames are read from independent sources +two video streams may become out of sync even when both streams are set up for the same frame rate. 
+A post-synchronization procedure can be applied to the streams to combine depth and color frames into +pairs. The sample code below demonstrates this procedure: -@snippetlineno samples/cpp/tutorial_code/videoio/orbbec_astra/orbbec_astra.cpp Show color frame +@snippetlineno samples/cpp/tutorial_code/videoio/orbbec_astra/orbbec_astra.cpp Pair frames -Depth frames can be picked the same way from the `depthFrames` list. +In the code snippet above the execution is blocked until there are some frames in both frame lists. +When there are new frames, their timestamps are being checked -- if they differ more than a half of +the frame period then one of the frames is dropped. If timestamps are close enough, then two frames +are paired. Now, we have two frames: one containing color information and another one -- depth information. +In the example above retrieved frames are simply shown with cv::imshow function, but you can insert +any other processing code here. -After that, you'll have two frames: one containing color information and another one -- depth -information. In the sample images below you can see the color frame and the depth frame showing -the same scene. Looking at the color frame it's hard to distinguish plant leaves from leaves painted -on a wall, but the depth data makes it easy. +In the sample images below you can see the color frame and the depth frame representing the same scene. +Looking at the color frame it's hard to distinguish plant leaves from leaves painted on a wall, +but the depth data makes it easy. ![Color frame](images/astra_color.jpg) ![Depth frame](images/astra_depth.png) diff --git a/doc/tutorials/imgcodecs/raster-gdal/raster_io_gdal.markdown b/doc/tutorials/app/raster_io_gdal.markdown similarity index 95% rename from doc/tutorials/imgcodecs/raster-gdal/raster_io_gdal.markdown rename to doc/tutorials/app/raster_io_gdal.markdown index 432caa69e0..73574cdccd 100644 --- a/doc/tutorials/imgcodecs/raster-gdal/raster_io_gdal.markdown +++ b/doc/tutorials/app/raster_io_gdal.markdown @@ -1,6 +1,16 @@ Reading Geospatial Raster files with GDAL {#tutorial_raster_io_gdal} ========================================= +@tableofcontents + +@prev_tutorial{tutorial_trackbar} +@next_tutorial{tutorial_video_input_psnr_ssim} + +| | | +| -: | :- | +| Original author | Marvin Smith | +| Compatibility | OpenCV >= 3.0 | + Geospatial raster data is a heavily used product in Geographic Information Systems and Photogrammetry. Raster data typically can represent imagery and Digital Elevation Models (DEM). The standard library for loading GIS imagery is the Geographic Data Abstraction Library [(GDAL)](http://www.gdal.org). 
In this diff --git a/doc/tutorials/app/table_of_content_app.markdown b/doc/tutorials/app/table_of_content_app.markdown new file mode 100644 index 0000000000..8e05dfaf07 --- /dev/null +++ b/doc/tutorials/app/table_of_content_app.markdown @@ -0,0 +1,10 @@ +Application utils (highgui, imgcodecs, videoio modules) {#tutorial_table_of_content_app} +======================================================= + +- @subpage tutorial_trackbar +- @subpage tutorial_raster_io_gdal +- @subpage tutorial_video_input_psnr_ssim +- @subpage tutorial_video_write +- @subpage tutorial_kinect_openni +- @subpage tutorial_orbbec_astra +- @subpage tutorial_intelperc diff --git a/doc/tutorials/highgui/trackbar/trackbar.markdown b/doc/tutorials/app/trackbar.markdown similarity index 96% rename from doc/tutorials/highgui/trackbar/trackbar.markdown rename to doc/tutorials/app/trackbar.markdown index d6700d6387..2b88114a6b 100644 --- a/doc/tutorials/highgui/trackbar/trackbar.markdown +++ b/doc/tutorials/app/trackbar.markdown @@ -1,6 +1,16 @@ Adding a Trackbar to our applications! {#tutorial_trackbar} ====================================== +@tableofcontents + +@next_tutorial{tutorial_raster_io_gdal} + +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + + - In the previous tutorials (about @ref tutorial_adding_images and the @ref tutorial_basic_linear_transform) you might have noted that we needed to give some **input** to our programs, such as \f$\alpha\f$ and \f$\beta\f$. We accomplished that by entering this data using the Terminal. diff --git a/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown b/doc/tutorials/app/video_input_psnr_ssim.markdown similarity index 98% rename from doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown rename to doc/tutorials/app/video_input_psnr_ssim.markdown index 76cfa3751d..e212c4e46d 100644 --- a/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown +++ b/doc/tutorials/app/video_input_psnr_ssim.markdown @@ -1,8 +1,16 @@ Video Input with OpenCV and similarity measurement {#tutorial_video_input_psnr_ssim} ================================================== +@tableofcontents + +@prev_tutorial{tutorial_raster_io_gdal} @next_tutorial{tutorial_video_write} +| | | +| -: | :- | +| Original author | Bernát Gábor | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/videoio/video-write/video_write.markdown b/doc/tutorials/app/video_write.markdown similarity index 97% rename from doc/tutorials/videoio/video-write/video_write.markdown rename to doc/tutorials/app/video_write.markdown index 0100f8cfc4..d655e24b89 100644 --- a/doc/tutorials/videoio/video-write/video_write.markdown +++ b/doc/tutorials/app/video_write.markdown @@ -1,9 +1,16 @@ Creating a video with OpenCV {#tutorial_video_write} ============================ +@tableofcontents + @prev_tutorial{tutorial_video_input_psnr_ssim} @next_tutorial{tutorial_kinect_openni} +| | | +| -: | :- | +| Original author | Bernát Gábor | +| Compatibility | OpenCV >= 3.0 | + Goal ---- @@ -59,7 +66,7 @@ extension, its first version. A direct limitation of this is that you cannot sav larger than 2 GB. Furthermore, you can only create and expand a single video track inside the container. No audio or other track editing support here. Nevertheless, any video codec present on your system might work. 
If you encounter some of these limitations you will need to look into more -specialized video writing libraries such as *FFMpeg* or codecs as *HuffYUV*, *CorePNG* and *LCL*. As +specialized video writing libraries such as *FFmpeg* or codecs such as *HuffYUV*, *CorePNG* and *LCL*. As an alternative, create the video track with OpenCV and expand it with sound tracks or convert it to other formats by using video manipulation programs such as *VirtualDub* or *AviSynth*. @@ -109,7 +116,7 @@ const string NAME = source.substr(0, pAt) + argv[2][0] + ".avi"; // Form the n @code{.cpp} CV_FOURCC('P','I','M','1') // this is an MPEG1 codec from the characters to integer @endcode - If you pass for this argument minus one than a window will pop up at runtime that contains all + If you pass for this argument minus one then a window will pop up at runtime that contains all the codecs installed on your system and asks you to select the one to use: ![](images/videoCompressSelect.png) diff --git a/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown b/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown index 90298124c7..00e1e9668f 100644 --- a/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown +++ b/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown @@ -1,9 +1,16 @@ Camera calibration With OpenCV {#tutorial_camera_calibration} ============================== +@tableofcontents + @prev_tutorial{tutorial_camera_calibration_square_chess} @next_tutorial{tutorial_real_time_pose} +| | | +| -: | :- | +| Original author | Bernát Gábor | +| Compatibility | OpenCV >= 4.0 | + Cameras have been around for a long, long time. However, with the introduction of the cheap *pinhole* cameras in the late 20th century, they became a common occurrence in our everyday life. diff --git a/doc/tutorials/calib3d/camera_calibration_pattern/camera_calibration_pattern.markdown b/doc/tutorials/calib3d/camera_calibration_pattern/camera_calibration_pattern.markdown index d6df8a8b5e..c87f9f95f8 100644 --- a/doc/tutorials/calib3d/camera_calibration_pattern/camera_calibration_pattern.markdown +++ b/doc/tutorials/calib3d/camera_calibration_pattern/camera_calibration_pattern.markdown @@ -1,8 +1,15 @@ Create calibration pattern {#tutorial_camera_calibration_pattern} ========================================= +@tableofcontents + @next_tutorial{tutorial_camera_calibration_square_chess} +| | | +| -: | :- | +| Original author | Laurent Berger | +| Compatibility | OpenCV >= 3.0 | + The goal of this tutorial is to learn how to create a calibration pattern. diff --git a/doc/tutorials/calib3d/camera_calibration_square_chess/camera_calibration_square_chess.markdown b/doc/tutorials/calib3d/camera_calibration_square_chess/camera_calibration_square_chess.markdown index 51b0a5eac7..b278bb87ac 100644 --- a/doc/tutorials/calib3d/camera_calibration_square_chess/camera_calibration_square_chess.markdown +++ b/doc/tutorials/calib3d/camera_calibration_square_chess/camera_calibration_square_chess.markdown @@ -1,9 +1,16 @@ Camera calibration with square chessboard {#tutorial_camera_calibration_square_chess} ========================================= +@tableofcontents + @prev_tutorial{tutorial_camera_calibration_pattern} @next_tutorial{tutorial_camera_calibration} +| | | +| -: | :- | +| Original author | Victor Eruhimov | +| Compatibility | OpenCV >= 4.0 | + The goal of this tutorial is to learn how to calibrate a camera given a set of chessboard images (a minimal code sketch follows below). 
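The chessboard-based calibration these tutorials describe boils down to detecting the board's inner corners in several views and passing the 3D-2D correspondences to @ref cv::calibrateCamera. A minimal Python sketch, assuming a board with 9x6 inner corners and input images matching `calib_*.jpg` (both are placeholders for your own data):

@code{.py}
import glob
import numpy as np
import cv2 as cv

pattern = (9, 6)  # inner corners per row and column (assumption)

# 3D reference coordinates of the corners on the z=0 board plane
objp = np.zeros((pattern[0] * pattern[1], 3), np.float32)
objp[:, :2] = np.mgrid[0:pattern[0], 0:pattern[1]].T.reshape(-1, 2)

obj_points, img_points = [], []
for name in glob.glob('calib_*.jpg'):
    gray = cv.cvtColor(cv.imread(name), cv.COLOR_BGR2GRAY)
    found, corners = cv.findChessboardCorners(gray, pattern)
    if found:
        obj_points.append(objp)
        img_points.append(corners)

# gray.shape[::-1] is the (width, height) of the last processed image
rms, camera_matrix, dist_coeffs, rvecs, tvecs = cv.calibrateCamera(
    obj_points, img_points, gray.shape[::-1], None, None)
print('RMS re-projection error:', rms)
@endcode

A low RMS re-projection error (well under one pixel) is the usual sanity check; refining the detected corners with cv.cornerSubPix before calibration typically improves it.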
diff --git a/doc/tutorials/calib3d/images/camera_calibration.png b/doc/tutorials/calib3d/images/camera_calibration.png deleted file mode 100644 index b010459c9d..0000000000 Binary files a/doc/tutorials/calib3d/images/camera_calibration.png and /dev/null differ diff --git a/doc/tutorials/calib3d/images/camera_calibration_square_chess.jpg b/doc/tutorials/calib3d/images/camera_calibration_square_chess.jpg deleted file mode 100644 index 1fcab0f83c..0000000000 Binary files a/doc/tutorials/calib3d/images/camera_calibration_square_chess.jpg and /dev/null differ diff --git a/doc/tutorials/calib3d/images/real_time_pose_estimation.jpg b/doc/tutorials/calib3d/images/real_time_pose_estimation.jpg deleted file mode 100644 index dcd24cc791..0000000000 Binary files a/doc/tutorials/calib3d/images/real_time_pose_estimation.jpg and /dev/null differ diff --git a/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown b/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown index 36e19e0754..3c4f0b0c83 100644 --- a/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown +++ b/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown @@ -1,8 +1,15 @@ Interactive camera calibration application {#tutorial_interactive_calibration} ============================== +@tableofcontents + @prev_tutorial{tutorial_real_time_pose} +| | | +| -: | :- | +| Original author | Vladislav Sovrasov | +| Compatibility | OpenCV >= 3.1 | + According to the classical calibration technique, the user must collect all data first and then run the @ref cv::calibrateCamera function to obtain camera parameters. If the average re-projection error is huge or if the estimated parameters seem to be wrong, the process of diff --git a/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown b/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown index 9888d29230..58419f8618 100644 --- a/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown +++ b/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown @@ -1,9 +1,16 @@ Real Time pose estimation of a textured object {#tutorial_real_time_pose} ============================================== +@tableofcontents + @prev_tutorial{tutorial_camera_calibration} @next_tutorial{tutorial_interactive_calibration} +| | | +| -: | :- | +| Original author | Edgar Riba | +| Compatibility | OpenCV >= 3.0 | + Nowadays, augmented reality is one of the top research topics in the computer vision and robotics fields. The most elemental problem in augmented reality is the estimation of the camera pose with respect to an diff --git a/doc/tutorials/calib3d/table_of_content_calib3d.markdown b/doc/tutorials/calib3d/table_of_content_calib3d.markdown index 3861d448b7..5fc6e591e9 100644 --- a/doc/tutorials/calib3d/table_of_content_calib3d.markdown +++ b/doc/tutorials/calib3d/table_of_content_calib3d.markdown @@ -1,58 +1,8 @@ Camera calibration and 3D reconstruction (calib3d module) {#tutorial_table_of_content_calib3d} ========================================================== -Although we get most of our images in a 2D format they do come from a 3D world. Here you will learn how to find out 3D world information from 2D images. - - @subpage tutorial_camera_calibration_pattern - - *Languages:* Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Laurent Berger - - You will learn how to create some calibration pattern. 
- - @subpage tutorial_camera_calibration_square_chess - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Victor Eruhimov - - You will use some chessboard images to calibrate your camera. - - @subpage tutorial_camera_calibration - - *Languages:* C++ - - *Compatibility:* \> OpenCV 4.0 - - *Author:* Bernát Gábor - - Camera calibration by using either the chessboard, circle or the asymmetrical circle - pattern. Get the images either from a camera attached, a video file or from an image - collection. - - @subpage tutorial_real_time_pose - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Edgar Riba - - Real time pose estimation of a textured object using ORB features, FlannBased matcher, PnP - approach plus Ransac and Linear Kalman Filter to reject possible bad poses. - - @subpage tutorial_interactive_calibration - - *Compatibility:* \> OpenCV 3.1 - - *Author:* Vladislav Sovrasov - - Camera calibration by using either the chessboard, chAruco, asymmetrical circle or dual asymmetrical circle - pattern. Calibration process is continuous, so you can see results after each new pattern shot. - As an output you get average reprojection error, intrinsic camera parameters, distortion coefficients and - confidence intervals for all of evaluated variables. diff --git a/doc/tutorials/core/adding_images/adding_images.markdown b/doc/tutorials/core/adding_images/adding_images.markdown index c8776325a3..3cec9f1734 100644 --- a/doc/tutorials/core/adding_images/adding_images.markdown +++ b/doc/tutorials/core/adding_images/adding_images.markdown @@ -1,9 +1,17 @@ Adding (blending) two images using OpenCV {#tutorial_adding_images} ========================================= +@tableofcontents + @prev_tutorial{tutorial_mat_operations} @next_tutorial{tutorial_basic_linear_transform} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + +We will learn how to blend two images! Goal ---- diff --git a/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown b/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown index 1eac760a4c..75bd655272 100644 --- a/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown +++ b/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown @@ -1,9 +1,16 @@ Changing the contrast and brightness of an image! 
{#tutorial_basic_linear_transform} ================================================= +@tableofcontents + @prev_tutorial{tutorial_adding_images} @next_tutorial{tutorial_discrete_fourier_transform} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/core/discrete_fourier_transform/discrete_fourier_transform.markdown b/doc/tutorials/core/discrete_fourier_transform/discrete_fourier_transform.markdown index 53ef27258d..1701babf4f 100644 --- a/doc/tutorials/core/discrete_fourier_transform/discrete_fourier_transform.markdown +++ b/doc/tutorials/core/discrete_fourier_transform/discrete_fourier_transform.markdown @@ -1,9 +1,16 @@ Discrete Fourier Transform {#tutorial_discrete_fourier_transform} ========================== +@tableofcontents + @prev_tutorial{tutorial_basic_linear_transform} @next_tutorial{tutorial_file_input_output_with_xml_yml} +| | | +| -: | :- | +| Original author | Bernát Gábor | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/core/file_input_output_with_xml_yml/file_input_output_with_xml_yml.markdown b/doc/tutorials/core/file_input_output_with_xml_yml/file_input_output_with_xml_yml.markdown index b87ec79ff7..da060cf27d 100644 --- a/doc/tutorials/core/file_input_output_with_xml_yml/file_input_output_with_xml_yml.markdown +++ b/doc/tutorials/core/file_input_output_with_xml_yml/file_input_output_with_xml_yml.markdown @@ -1,9 +1,16 @@ File Input and Output using XML and YAML files {#tutorial_file_input_output_with_xml_yml} ============================================== +@tableofcontents + @prev_tutorial{tutorial_discrete_fourier_transform} @next_tutorial{tutorial_how_to_use_OpenCV_parallel_for_} +| | | +| -: | :- | +| Original author | Bernát Gábor | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown b/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown index c5028d6a3a..d19936ecbe 100644 --- a/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown +++ b/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown @@ -1,9 +1,16 @@ How to scan images, lookup tables and time measurement with OpenCV {#tutorial_how_to_scan_images} ================================================================== +@tableofcontents + @prev_tutorial{tutorial_mat_the_basic_image_container} @next_tutorial{tutorial_mat_mask_operations} +| | | +| -: | :- | +| Original author | Bernát Gábor | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown index 80cc6c68fe..92f73b77e8 100644 --- a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown +++ b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown @@ -1,8 +1,14 @@ How to use the OpenCV parallel_for_ to parallelize your code {#tutorial_how_to_use_OpenCV_parallel_for_} ================================================================== +@tableofcontents + @prev_tutorial{tutorial_file_input_output_with_xml_yml} +| | | +| -: | :- | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/core/images/Adding_Images_Tutorial_Result_0.jpg b/doc/tutorials/core/images/Adding_Images_Tutorial_Result_0.jpg deleted file mode 100644 index 940b54c82f..0000000000 Binary files 
a/doc/tutorials/core/images/Adding_Images_Tutorial_Result_0.jpg and /dev/null differ diff --git a/doc/tutorials/core/images/Basic_Linear_Transform_Tutorial_Result_0.jpg b/doc/tutorials/core/images/Basic_Linear_Transform_Tutorial_Result_0.jpg deleted file mode 100644 index eccf37aa20..0000000000 Binary files a/doc/tutorials/core/images/Basic_Linear_Transform_Tutorial_Result_0.jpg and /dev/null differ diff --git a/doc/tutorials/core/images/Drawing_1_Tutorial_Result_0.jpg b/doc/tutorials/core/images/Drawing_1_Tutorial_Result_0.jpg deleted file mode 100644 index 05e8f01232..0000000000 Binary files a/doc/tutorials/core/images/Drawing_1_Tutorial_Result_0.jpg and /dev/null differ diff --git a/doc/tutorials/core/images/Drawing_2_Tutorial_Result_7.jpg b/doc/tutorials/core/images/Drawing_2_Tutorial_Result_7.jpg deleted file mode 100644 index d650c18427..0000000000 Binary files a/doc/tutorials/core/images/Drawing_2_Tutorial_Result_7.jpg and /dev/null differ diff --git a/doc/tutorials/core/images/Morphology_1_Tutorial_Cover.jpg b/doc/tutorials/core/images/Morphology_1_Tutorial_Cover.jpg deleted file mode 100644 index 71509ba5b8..0000000000 Binary files a/doc/tutorials/core/images/Morphology_1_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/core/images/Smoothing_Tutorial_Cover.jpg b/doc/tutorials/core/images/Smoothing_Tutorial_Cover.jpg deleted file mode 100644 index c11f2ed024..0000000000 Binary files a/doc/tutorials/core/images/Smoothing_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/core/images/discrete_fourier_transform.png b/doc/tutorials/core/images/discrete_fourier_transform.png deleted file mode 100644 index 07bd1119f4..0000000000 Binary files a/doc/tutorials/core/images/discrete_fourier_transform.png and /dev/null differ diff --git a/doc/tutorials/core/images/file_input_output_with_xml_yml.png b/doc/tutorials/core/images/file_input_output_with_xml_yml.png deleted file mode 100644 index 24ae4fdd23..0000000000 Binary files a/doc/tutorials/core/images/file_input_output_with_xml_yml.png and /dev/null differ diff --git a/doc/tutorials/core/images/howToScanImages.jpg b/doc/tutorials/core/images/howToScanImages.jpg deleted file mode 100644 index 4e0fa26d0d..0000000000 Binary files a/doc/tutorials/core/images/howToScanImages.jpg and /dev/null differ diff --git a/doc/tutorials/core/images/interopOpenCV1.png b/doc/tutorials/core/images/interopOpenCV1.png deleted file mode 100644 index 040f50a003..0000000000 Binary files a/doc/tutorials/core/images/interopOpenCV1.png and /dev/null differ diff --git a/doc/tutorials/core/images/matMaskFilter2DOp.png b/doc/tutorials/core/images/matMaskFilter2DOp.png deleted file mode 100644 index 6795921608..0000000000 Binary files a/doc/tutorials/core/images/matMaskFilter2DOp.png and /dev/null differ diff --git a/doc/tutorials/core/images/matTheBasicImageStructure.jpg b/doc/tutorials/core/images/matTheBasicImageStructure.jpg deleted file mode 100644 index ab6704a3c9..0000000000 Binary files a/doc/tutorials/core/images/matTheBasicImageStructure.jpg and /dev/null differ diff --git a/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown b/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown index fedb123ae6..43c71d7159 100644 --- a/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown +++ b/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown @@ -1,9 +1,16 @@ Mask operations on matrices {#tutorial_mat_mask_operations} =========================== +@tableofcontents + 
@prev_tutorial{tutorial_how_to_scan_images} @next_tutorial{tutorial_mat_operations} +| | | +| -: | :- | +| Original author | Bernát Gábor | +| Compatibility | OpenCV >= 3.0 | + Mask operations on matrices are quite simple. The idea is that we recalculate each pixel's value in an image according to a mask matrix (also known as kernel). This mask holds values that will adjust how much influence neighboring pixels (and the current pixel) have on the new pixel value. From a diff --git a/doc/tutorials/core/mat_operations.markdown b/doc/tutorials/core/mat_operations.markdown index 991d01367b..331a847551 100644 --- a/doc/tutorials/core/mat_operations.markdown +++ b/doc/tutorials/core/mat_operations.markdown @@ -1,9 +1,15 @@ Operations with images {#tutorial_mat_operations} ====================== +@tableofcontents + @prev_tutorial{tutorial_mat_mask_operations} @next_tutorial{tutorial_adding_images} +| | | +| -: | :- | +| Compatibility | OpenCV >= 3.0 | + Input/Output ------------ diff --git a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown index 573e112d61..4f6f2b8a88 100644 --- a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown +++ b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown @@ -1,8 +1,15 @@ Mat - The Basic Image Container {#tutorial_mat_the_basic_image_container} =============================== +@tableofcontents + @next_tutorial{tutorial_how_to_scan_images} +| | | +| -: | :- | +| Original author | Bernát Gábor | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/core/table_of_content_core.markdown b/doc/tutorials/core/table_of_content_core.markdown index c607d4c02c..4cd77fcdfc 100644 --- a/doc/tutorials/core/table_of_content_core.markdown +++ b/doc/tutorials/core/table_of_content_core.markdown @@ -1,97 +1,12 @@ The Core Functionality (core module) {#tutorial_table_of_content_core} ===================================== -Here you will learn the about the basic building blocks of the library. A must read and know for -understanding how to manipulate the images on a pixel level. - - @subpage tutorial_mat_the_basic_image_container - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Bernát Gábor - - You will learn how to store images in the memory and how to print out their content to the - console. - - @subpage tutorial_how_to_scan_images - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Bernát Gábor - - You'll find out how to scan images (go through each of the image pixels) with OpenCV. - Bonus: time measurement with OpenCV. - - - @subpage tutorial_mat_mask_operations - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Bernát Gábor - - You'll find out how to scan images with neighbor access and use the @ref cv::filter2D - function to apply kernel filters on images. - - @subpage tutorial_mat_operations - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - Reading/writing images from file, accessing pixels, primitive operations, visualizing images. - - @subpage tutorial_adding_images - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - We will learn how to blend two images! 
- - @subpage tutorial_basic_linear_transform - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - We will learn how to change our image appearance! - - @subpage tutorial_discrete_fourier_transform - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Bernát Gábor - - You will see how and why use the Discrete Fourier transformation with OpenCV. - - - @subpage tutorial_file_input_output_with_xml_yml - - *Languages:* C++, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Bernát Gábor - - You will see how to use the @ref cv::FileStorage data structure of OpenCV to write and read - data to XML or YAML file format. - - @subpage tutorial_how_to_use_OpenCV_parallel_for_ - - *Languages:* C++ - - *Compatibility:* \>= OpenCV 2.4.3 - - You will see how to use the OpenCV parallel_for_ to easily parallelize your code. diff --git a/doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown b/doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown index 43c86acaf0..48a55992c6 100644 --- a/doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown +++ b/doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown @@ -1,6 +1,14 @@ # How to run custom OCR model {#tutorial_dnn_OCR} +@tableofcontents + @prev_tutorial{tutorial_dnn_custom_layers} +@next_tutorial{tutorial_dnn_text_spotting} + +| | | +| -: | :- | +| Original author | Zihao Mu | +| Compatibility | OpenCV >= 4.3 | ## Introduction @@ -43,4 +51,4 @@ The input of text recognition model is the output of the text detection model, w DenseNet_CTC has the smallest parameters and best FPS, and it is suitable for edge devices, which are very sensitive to the cost of calculation. If you have limited computing resources and want to achieve better accuracy, VGG_CTC is a good choice. -CRNN_VGG_BiLSTM_CTC is suitable for scenarios that require high recognition accuracy. \ No newline at end of file +CRNN_VGG_BiLSTM_CTC is suitable for scenarios that require high recognition accuracy. diff --git a/doc/tutorials/dnn/dnn_android/dnn_android.markdown b/doc/tutorials/dnn/dnn_android/dnn_android.markdown index 04520245da..4eb1ff238e 100644 --- a/doc/tutorials/dnn/dnn_android/dnn_android.markdown +++ b/doc/tutorials/dnn/dnn_android/dnn_android.markdown @@ -1,8 +1,15 @@ # How to run deep networks on Android device {#tutorial_dnn_android} +@tableofcontents + @prev_tutorial{tutorial_dnn_halide_scheduling} @next_tutorial{tutorial_dnn_yolo} +| | | +| -: | :- | +| Original author | Dmitry Kurtaev | +| Compatibility | OpenCV >= 3.3 | + ## Introduction In this tutorial you'll know how to run deep learning networks on Android device using OpenCV deep learning module. diff --git a/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md b/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md index feed5aaf76..07c3fb4a7f 100644 --- a/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md +++ b/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md @@ -1,8 +1,15 @@ # Custom deep learning layers support {#tutorial_dnn_custom_layers} +@tableofcontents + @prev_tutorial{tutorial_dnn_javascript} @next_tutorial{tutorial_dnn_OCR} +| | | +| -: | :- | +| Original author | Dmitry Kurtaev | +| Compatibility | OpenCV >= 3.4.1 | + ## Introduction Deep learning is a fast growing area. The new approaches to build neural networks usually introduce new types of layers. 
They could be modifications of existing diff --git a/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown b/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown index f6040dce1c..a886e9e089 100644 --- a/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown +++ b/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown @@ -1,8 +1,15 @@ Load Caffe framework models {#tutorial_dnn_googlenet} =========================== +@tableofcontents + @next_tutorial{tutorial_dnn_halide} +| | | +| -: | :- | +| Original author | Vitaliy Lyudvichenko | +| Compatibility | OpenCV >= 3.3 | + Introduction ------------ diff --git a/doc/tutorials/dnn/dnn_halide/dnn_halide.markdown b/doc/tutorials/dnn/dnn_halide/dnn_halide.markdown index 0500d25150..84ab50c193 100644 --- a/doc/tutorials/dnn/dnn_halide/dnn_halide.markdown +++ b/doc/tutorials/dnn/dnn_halide/dnn_halide.markdown @@ -1,8 +1,15 @@ # How to enable Halide backend for improve efficiency {#tutorial_dnn_halide} +@tableofcontents + @prev_tutorial{tutorial_dnn_googlenet} @next_tutorial{tutorial_dnn_halide_scheduling} +| | | +| -: | :- | +| Original author | Dmitry Kurtaev | +| Compatibility | OpenCV >= 3.3 | + ## Introduction This tutorial guidelines how to run your models in OpenCV deep learning module using Halide language backend. Halide is an open-source project that let us diff --git a/doc/tutorials/dnn/dnn_halide_scheduling/dnn_halide_scheduling.markdown b/doc/tutorials/dnn/dnn_halide_scheduling/dnn_halide_scheduling.markdown index b825da7922..e4a6f1fecc 100644 --- a/doc/tutorials/dnn/dnn_halide_scheduling/dnn_halide_scheduling.markdown +++ b/doc/tutorials/dnn/dnn_halide_scheduling/dnn_halide_scheduling.markdown @@ -1,8 +1,15 @@ # How to schedule your network for Halide backend {#tutorial_dnn_halide_scheduling} +@tableofcontents + @prev_tutorial{tutorial_dnn_halide} @next_tutorial{tutorial_dnn_android} +| | | +| -: | :- | +| Original author | Dmitry Kurtaev | +| Compatibility | OpenCV >= 3.3 | + ## Introduction Halide code is the same for every device we use. But for achieving the satisfied efficiency we should schedule computations properly. In this tutorial we describe diff --git a/doc/tutorials/dnn/dnn_javascript/dnn_javascript.markdown b/doc/tutorials/dnn/dnn_javascript/dnn_javascript.markdown index 9ad632fbc8..49b6f33adb 100644 --- a/doc/tutorials/dnn/dnn_javascript/dnn_javascript.markdown +++ b/doc/tutorials/dnn/dnn_javascript/dnn_javascript.markdown @@ -1,8 +1,15 @@ # How to run deep networks in browser {#tutorial_dnn_javascript} +@tableofcontents + @prev_tutorial{tutorial_dnn_yolo} @next_tutorial{tutorial_dnn_custom_layers} +| | | +| -: | :- | +| Original author | Dmitry Kurtaev | +| Compatibility | OpenCV >= 3.3.1 | + ## Introduction This tutorial will show us how to run deep learning models using OpenCV.js right in a browser. 
Tutorial refers a sample of face detection and face recognition diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/opencv_resnet50_test_res_c.jpg b/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/opencv_resnet50_test_res_c.jpg new file mode 100644 index 0000000000..4d1ba30378 Binary files /dev/null and b/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/opencv_resnet50_test_res_c.jpg differ diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/pytorch_resnet50_opencv_test_res.jpg b/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/pytorch_resnet50_opencv_test_res.jpg new file mode 100644 index 0000000000..7bee270616 Binary files /dev/null and b/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/pytorch_resnet50_opencv_test_res.jpg differ diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/squirrel_cls.jpg b/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/squirrel_cls.jpg new file mode 100644 index 0000000000..289b13bbd3 Binary files /dev/null and b/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/squirrel_cls.jpg differ diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/tf_mobilenet_opencv_test_res.jpg b/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/tf_mobilenet_opencv_test_res.jpg new file mode 100644 index 0000000000..cc18156760 Binary files /dev/null and b/doc/tutorials/dnn/dnn_pytorch_tf_classification/images/tf_mobilenet_opencv_test_res.jpg differ diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_classification/pytorch_cls_model_conversion_c_tutorial.md b/doc/tutorials/dnn/dnn_pytorch_tf_classification/pytorch_cls_model_conversion_c_tutorial.md new file mode 100644 index 0000000000..1807caf0b4 --- /dev/null +++ b/doc/tutorials/dnn/dnn_pytorch_tf_classification/pytorch_cls_model_conversion_c_tutorial.md @@ -0,0 +1,220 @@ +# Conversion of PyTorch Classification Models and Launch with OpenCV C++ {#pytorch_cls_c_tutorial_dnn_conversion} + +@prev_tutorial{pytorch_cls_tutorial_dnn_conversion} + +| | | +| -: | :- | +| Original author | Anastasia Murzova | +| Compatibility | OpenCV >= 4.5 | + +## Goals +In this tutorial you will learn how to: +* convert PyTorch classification models into ONNX format +* run the converted PyTorch model with the OpenCV C/C++ API +* provide model inference + +We will explore the above-listed points by the example of the ResNet-50 architecture. + +## Introduction +Let's briefly review the key concepts involved in the pipeline of PyTorch model transition with OpenCV API. The initial step in the conversion of PyTorch models into cv::dnn::Net +is transferring the model into [ONNX](https://onnx.ai/about.html) format. ONNX aims at the interchangeability of neural networks between various frameworks. There is a built-in function in PyTorch for ONNX conversion: [``torch.onnx.export``](https://pytorch.org/docs/stable/onnx.html#torch.onnx.export). +Then the obtained ``.onnx`` model is passed into cv::dnn::readNetFromONNX or cv::dnn::readNet. + +## Requirements +To be able to experiment with the below code you will need to install a set of libraries. We will use a virtual environment with python3.7+ for this: + +```console +virtualenv -p /usr/bin/python3.7 <env_dir> +source <env_dir>/bin/activate +``` + +For OpenCV-Python building from source, follow the corresponding instructions from the @ref tutorial_py_table_of_contents_setup.
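+As a quick sanity check (an optional step, not part of the original sample), you can verify that the activated environment uses the expected interpreter and already sees an OpenCV build:
+
+```console
+python -c "import sys; print(sys.version)"
+python -c "import cv2; print(cv2.__version__)"
+```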
+ +Before you start the installation of the libraries, you can customize the [requirements.txt](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt), excluding or including (for example, ``opencv-python``) some dependencies. +The below line initiates requirements installation into the previously activated virtual environment: + +```console +pip install -r requirements.txt +``` + +## Practice +In this part we are going to cover the following points: +1. create a classification model conversion pipeline +2. provide the inference, process prediction results + +### Model Conversion Pipeline +The code in this subchapter is located in the ``samples/dnn/dnn_model_runner`` module and can be executed with the line: + +```console +python -m dnn_model_runner.dnn_conversion.pytorch.classification.py_to_py_resnet50_onnx +``` + +The following code contains the description of the below-listed steps: +1. instantiate PyTorch model +2. convert PyTorch model into ``.onnx`` + +```python +# initialize PyTorch ResNet-50 model +original_model = models.resnet50(pretrained=True) + +# get the path to the PyTorch model converted into ONNX +full_model_path = get_pytorch_onnx_model(original_model) +print("PyTorch ResNet-50 model was successfully converted: ", full_model_path) +``` + +The ``get_pytorch_onnx_model(original_model)`` function is based on the ``torch.onnx.export(...)`` call: + +```python +# define the directory to save the converted model in +onnx_model_path = "models" +# define the name of the converted model +onnx_model_name = "resnet50.onnx" + +# create the directory for the converted model +os.makedirs(onnx_model_path, exist_ok=True) + +# get full path to the converted model +full_model_path = os.path.join(onnx_model_path, onnx_model_name) + +# generate model input +generated_input = Variable( + torch.randn(1, 3, 224, 224) +) + +# model export into ONNX format +torch.onnx.export( + original_model, + generated_input, + full_model_path, + verbose=True, + input_names=["input"], + output_names=["output"], + opset_version=11 +) +``` + +After the successful execution of the above code we will get the following output: + +```console +PyTorch ResNet-50 model was successfully converted: models/resnet50.onnx +``` + +The ``dnn_model_runner`` module proposed in ``samples/dnn`` allows us to reproduce the above conversion steps for the following PyTorch classification models: +* alexnet +* vgg11 +* vgg13 +* vgg16 +* vgg19 +* resnet18 +* resnet34 +* resnet50 +* resnet101 +* resnet152 +* squeezenet1_0 +* squeezenet1_1 +* resnext50_32x4d +* resnext101_32x8d +* wide_resnet50_2 +* wide_resnet101_2 + +To obtain the converted model, the following line should be executed: + +``` +python -m dnn_model_runner.dnn_conversion.pytorch.classification.py_to_py_cls --model_name <model_name> --evaluate False +``` + +For the ResNet-50 case the below line should be run: + +``` +python -m dnn_model_runner.dnn_conversion.pytorch.classification.py_to_py_cls --model_name resnet50 --evaluate False +``` + +The default root directory for the converted model storage is defined in the ``CommonConfig`` configuration class: + +```python +@dataclass +class CommonConfig: + output_data_root_dir: str = "dnn_model_runner/dnn_conversion" +``` + +Thus, the converted ResNet-50 will be saved in ``dnn_model_runner/dnn_conversion/models``. + +### Inference Pipeline +Now we can use ```models/resnet50.onnx``` for the inference pipeline using OpenCV C/C++ API.
The implemented pipeline can be found in [samples/dnn/classification.cpp](https://github.com/opencv/opencv/blob/master/samples/dnn/classification.cpp). +After the build of samples (``BUILD_EXAMPLES`` flag value should be ``ON``), the appropriate ``example_dnn_classification`` executable file will be provided. + +To provide model inference we will use the below [squirrel photo](https://www.pexels.com/photo/brown-squirrel-eating-1564292) (under [CC0](https://www.pexels.com/terms-of-service/) license) corresponding to ImageNet class ID 335: +```console +fox squirrel, eastern fox squirrel, Sciurus niger +``` + +![Classification model input image](images/squirrel_cls.jpg) + +For the label decoding of the obtained prediction, we also need the ``imagenet_classes.txt`` file, which contains the full list of the ImageNet classes. + +In this tutorial we will run the inference process for the converted PyTorch ResNet-50 model from the build (``samples/build``) directory: + +``` +./dnn/example_dnn_classification --model=../dnn/models/resnet50.onnx --input=../data/squirrel_cls.jpg --width=224 --height=224 --rgb=true --scale="0.003921569" --mean="123.675 116.28 103.53" --std="0.229 0.224 0.225" --crop=true --initial_width=256 --initial_height=256 --classes=../data/dnn/classification_classes_ILSVRC2012.txt +``` + +Let's explore ``classification.cpp`` key points step by step: + +* read the model with cv::dnn::readNet and initialize the network: + +```cpp +Net net = readNet(model, config, framework); +``` + +The ``model`` parameter value is taken from the ``--model`` key. In our case, it is ``resnet50.onnx``. + +* preprocess input image: + +```cpp +if (rszWidth != 0 && rszHeight != 0) +{ + resize(frame, frame, Size(rszWidth, rszHeight)); +} + +// Create a 4D blob from a frame +blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, crop); + +// Check std values. +if (std.val[0] != 0.0 && std.val[1] != 0.0 && std.val[2] != 0.0) +{ + // Divide blob by std. + divide(blob, std, blob); +} +``` + +In this step we use the cv::dnn::blobFromImage function to prepare the model input. +We set ``Size(rszWidth, rszHeight)`` with ``--initial_width=256 --initial_height=256`` for the initial image resize as described in [PyTorch ResNet inference pipeline](https://pytorch.org/hub/pytorch_vision_resnet/). + +It should be noted that in cv::dnn::blobFromImage the mean value is subtracted first, and only then are the pixel values multiplied by the scale. +Thus, we use ``--mean="123.675 116.28 103.53"``, which is equivalent to ``[0.485, 0.456, 0.406]`` multiplied by ``255.0`` to reproduce the original image preprocessing order for PyTorch classification models: + +```python +img /= 255.0 +img -= [0.485, 0.456, 0.406] +img /= [0.229, 0.224, 0.225] +``` + +* make forward pass: + +```cpp +net.setInput(blob); +Mat prob = net.forward(); +``` + +* process the prediction: + +```cpp +Point classIdPoint; +double confidence; +minMaxLoc(prob.reshape(1, 1), 0, &confidence, 0, &classIdPoint); +int classId = classIdPoint.x; +``` + +Here we choose the most likely object class.
The ``classId`` result for our case is 335 - fox squirrel, eastern fox squirrel, Sciurus niger: + +![ResNet50 OpenCV C++ inference output](images/opencv_resnet50_test_res_c.jpg) diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_classification/pytorch_cls_model_conversion_tutorial.md b/doc/tutorials/dnn/dnn_pytorch_tf_classification/pytorch_cls_model_conversion_tutorial.md new file mode 100644 index 0000000000..409d2f5a49 --- /dev/null +++ b/doc/tutorials/dnn/dnn_pytorch_tf_classification/pytorch_cls_model_conversion_tutorial.md @@ -0,0 +1,362 @@ +# Conversion of PyTorch Classification Models and Launch with OpenCV Python {#pytorch_cls_tutorial_dnn_conversion} + +@prev_tutorial{tutorial_dnn_OCR} +@next_tutorial{pytorch_cls_c_tutorial_dnn_conversion} + +| | | +| -: | :- | +| Original author | Anastasia Murzova | +| Compatibility | OpenCV >= 4.5 | + +## Goals +In this tutorial you will learn how to: +* convert PyTorch classification models into ONNX format +* run converted PyTorch model with OpenCV Python API +* obtain an evaluation of the PyTorch and OpenCV DNN models + +We will explore the above-listed points by the example of the ResNet-50 architecture. + +## Introduction +Let's briefly review the key concepts involved in the pipeline of PyTorch model transition with OpenCV API. The initial step in the conversion of PyTorch models into cv.dnn.Net +is transferring the model into [ONNX](https://onnx.ai/about.html) format. ONNX aims at the interchangeability of neural networks between various frameworks. There is a built-in function in PyTorch for ONNX conversion: [``torch.onnx.export``](https://pytorch.org/docs/stable/onnx.html#torch.onnx.export). +Then the obtained ``.onnx`` model is passed into cv.dnn.readNetFromONNX. + +## Requirements +To be able to experiment with the below code you will need to install a set of libraries. We will use a virtual environment with python3.7+ for this: + +```console +virtualenv -p /usr/bin/python3.7 <env_dir> +source <env_dir>/bin/activate +``` + +For OpenCV-Python building from source, follow the corresponding instructions from the @ref tutorial_py_table_of_contents_setup. + +Before you start the installation of the libraries, you can customize the [requirements.txt](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt), excluding or including (for example, ``opencv-python``) some dependencies. +The below line initiates requirements installation into the previously activated virtual environment: + +```console +pip install -r requirements.txt +``` + +## Practice +In this part we are going to cover the following points: +1. create a classification model conversion pipeline and provide the inference +2. evaluate and test classification models + +If you only want to run the evaluation or test model pipelines, the "Model Conversion Pipeline" part can be skipped. + +### Model Conversion Pipeline +The code in this subchapter is located in the ``dnn_model_runner`` module and can be executed with the line: + +```console +python -m dnn_model_runner.dnn_conversion.pytorch.classification.py_to_py_resnet50 +``` + +The following code contains the description of the below-listed steps: +1. instantiate PyTorch model +2. convert PyTorch model into ``.onnx`` +3. read the transferred network with OpenCV API +4. prepare input data +5. 
provide inference + +```python +# initialize PyTorch ResNet-50 model +original_model = models.resnet50(pretrained=True) + +# get the path to the PyTorch model converted into ONNX +full_model_path = get_pytorch_onnx_model(original_model) + +# read converted .onnx model with OpenCV API +opencv_net = cv2.dnn.readNetFromONNX(full_model_path) +print("OpenCV model was successfully read. Layer IDs: \n", opencv_net.getLayerNames()) + +# get preprocessed image +input_img = get_preprocessed_img("../data/squirrel_cls.jpg") + +# get ImageNet labels +imagenet_labels = get_imagenet_labels("../data/dnn/classification_classes_ILSVRC2012.txt") + +# obtain OpenCV DNN predictions +get_opencv_dnn_prediction(opencv_net, input_img, imagenet_labels) + +# obtain original PyTorch ResNet-50 predictions +get_pytorch_dnn_prediction(original_model, input_img, imagenet_labels) +``` + +To provide model inference we will use the below [squirrel photo](https://www.pexels.com/photo/brown-squirrel-eating-1564292) (under [CC0](https://www.pexels.com/terms-of-service/) license) corresponding to ImageNet class ID 335: +```console +fox squirrel, eastern fox squirrel, Sciurus niger +``` + +![Classification model input image](images/squirrel_cls.jpg) + +For the label decoding of the obtained prediction, we also need the ``imagenet_classes.txt`` file, which contains the full list of the ImageNet classes. + +Let's go deeper into each step by the example of the pretrained PyTorch ResNet-50: +* instantiate PyTorch ResNet-50 model: + +```python +# initialize PyTorch ResNet-50 model +original_model = models.resnet50(pretrained=True) +``` + +* convert PyTorch model into ONNX: + +```python +# define the directory to save the converted model in +onnx_model_path = "models" +# define the name of the converted model +onnx_model_name = "resnet50.onnx" + +# create the directory for the converted model +os.makedirs(onnx_model_path, exist_ok=True) + +# get full path to the converted model +full_model_path = os.path.join(onnx_model_path, onnx_model_name) + +# generate model input +generated_input = Variable( + torch.randn(1, 3, 224, 224) +) + +# model export into ONNX format +torch.onnx.export( + original_model, + generated_input, + full_model_path, + verbose=True, + input_names=["input"], + output_names=["output"], + opset_version=11 +) +``` + +After the successful execution of the above code, we will get ``models/resnet50.onnx``. + +* read the transferred network with cv.dnn.readNetFromONNX, passing the ONNX model obtained in the previous step into it: + +```python +# read converted .onnx model with OpenCV API +opencv_net = cv2.dnn.readNetFromONNX(full_model_path) +``` + +* prepare input data: + +```python +# read the image +input_img = cv2.imread(img_path, cv2.IMREAD_COLOR) +input_img = input_img.astype(np.float32) + +input_img = cv2.resize(input_img, (256, 256)) + +# define preprocess parameters +mean = np.array([0.485, 0.456, 0.406]) * 255.0 +scale = 1 / 255.0 +std = [0.229, 0.224, 0.225] + +# prepare input blob to fit the model input: +# 1. subtract mean +# 2. scale to set pixel values from 0 to 1 +input_blob = cv2.dnn.blobFromImage( + image=input_img, + scalefactor=scale, + size=(224, 224), # img target size + mean=mean, + swapRB=True, # BGR -> RGB + crop=True # center crop +) +# 3. divide by std +input_blob[0] /= np.asarray(std, dtype=np.float32).reshape(3, 1, 1) +``` + +In this step we read the image and prepare the model input with the cv.dnn.blobFromImage function, which returns a 4-dimensional blob.
+It should be noted that in cv.dnn.blobFromImage the mean value is subtracted first, and only then are the pixel values multiplied by the scale. Thus, ``mean`` is multiplied by ``255.0`` to reproduce the original image preprocessing order: + +```python +img /= 255.0 +img -= [0.485, 0.456, 0.406] +img /= [0.229, 0.224, 0.225] +``` + +* OpenCV cv.dnn.Net inference: + +```python +# set OpenCV DNN input +opencv_net.setInput(preproc_img) + +# OpenCV DNN inference +out = opencv_net.forward() +print("OpenCV DNN prediction: \n") +print("* shape: ", out.shape) + +# get the predicted class ID +imagenet_class_id = np.argmax(out) + +# get confidence +confidence = out[0][imagenet_class_id] +print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id])) +print("* confidence: {:.4f}".format(confidence)) +``` + +After the above code execution we will get the following output: + +```console +OpenCV DNN prediction: +* shape: (1, 1000) +* class ID: 335, label: fox squirrel, eastern fox squirrel, Sciurus niger +* confidence: 14.8308 +``` + +* PyTorch ResNet-50 model inference: + +```python +original_net.eval() +preproc_img = torch.FloatTensor(preproc_img) + +# inference +out = original_net(preproc_img) +print("\nPyTorch model prediction: \n") +print("* shape: ", out.shape) + +# get the predicted class ID +imagenet_class_id = torch.argmax(out, axis=1).item() +print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id])) + +# get confidence +confidence = out[0][imagenet_class_id] +print("* confidence: {:.4f}".format(confidence.item())) +``` + +After launching the above code we will get the following output: + +```console +PyTorch model prediction: +* shape: torch.Size([1, 1000]) +* class ID: 335, label: fox squirrel, eastern fox squirrel, Sciurus niger +* confidence: 14.8308 +``` + +The inference results of the original ResNet-50 model and cv.dnn.Net are equal. For the extended evaluation of the models we can use the ``py_to_py_cls`` script of the ``dnn_model_runner`` module. This module part will be described in the next subchapter. + +### Evaluation of the Models + +The ``dnn_model_runner`` module proposed in ``samples/dnn`` allows running the full evaluation pipeline on the ImageNet dataset and test execution for the following PyTorch classification models: +* alexnet +* vgg11 +* vgg13 +* vgg16 +* vgg19 +* resnet18 +* resnet34 +* resnet50 +* resnet101 +* resnet152 +* squeezenet1_0 +* squeezenet1_1 +* resnext50_32x4d +* resnext101_32x8d +* wide_resnet50_2 +* wide_resnet101_2 + +This list can also be extended with further appropriate evaluation pipeline configuration. + +#### Evaluation Mode + +The below line runs the module in evaluation mode: + +```console +python -m dnn_model_runner.dnn_conversion.pytorch.classification.py_to_py_cls --model_name <model_name> +``` + +The classification model chosen from the list will be read into an OpenCV cv.dnn.Net object. Evaluation results of PyTorch and OpenCV models (accuracy, inference time, L1) will be written into the log file. Inference time values will also be depicted in a chart to generalize the obtained model information.
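+For intuition, the L1 metric mentioned above can be reproduced manually. A minimal sketch, assuming both predictions were converted to NumPy arrays (``l1_diff`` is an illustrative helper, not the module's actual API):
+
+```python
+import numpy as np
+
+def l1_diff(opencv_out, pytorch_out):
+    # mean absolute difference between the two (1, 1000) score arrays;
+    # convert the torch output first with out.detach().numpy()
+    return np.mean(np.abs(opencv_out - pytorch_out))
+```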
+ +Necessary evaluation configurations are defined in the [test_config.py](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs/test_config.py) and can be modified to match the actual data locations: + +```python +@dataclass +class TestClsConfig: + batch_size: int = 50 + frame_size: int = 224 + img_root_dir: str = "./ILSVRC2012_img_val" + # location of image-class matching + img_cls_file: str = "./val.txt" + bgr_to_rgb: bool = True +``` + +To initiate the evaluation of the PyTorch ResNet-50, run the following line: + +```console +python -m dnn_model_runner.dnn_conversion.pytorch.classification.py_to_py_cls --model_name resnet50 +``` + +After script launch, the log file with evaluation data will be generated in ``dnn_model_runner/dnn_conversion/logs``: + +```console +The model PyTorch resnet50 was successfully obtained and converted to OpenCV DNN resnet50 +===== Running evaluation of the model with the following params: + * val data location: ./ILSVRC2012_img_val + * log file location: dnn_model_runner/dnn_conversion/logs/PyTorch_resnet50_log.txt +``` + +#### Test Mode + +The below line runs the module in test mode, namely it provides the steps for the model inference: + +```console +python -m dnn_model_runner.dnn_conversion.pytorch.classification.py_to_py_cls --model_name <model_name> --test True --default_img_preprocess <True/False> --evaluate False +``` + +Here the ``default_img_preprocess`` key defines whether you'd like to parametrize the model test process with some particular values or use the default values, for example, ``scale``, ``mean`` or ``std``. + +The test configuration is represented in the [test_config.py](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs/test_config.py) ``TestClsModuleConfig`` class: + +```python +@dataclass +class TestClsModuleConfig: + cls_test_data_dir: str = "../data" + test_module_name: str = "classification" + test_module_path: str = "classification.py" + input_img: str = os.path.join(cls_test_data_dir, "squirrel_cls.jpg") + model: str = "" + + frame_height: str = str(TestClsConfig.frame_size) + frame_width: str = str(TestClsConfig.frame_size) + scale: str = "1.0" + mean: List[str] = field(default_factory=lambda: ["0.0", "0.0", "0.0"]) + std: List[str] = field(default_factory=list) + crop: str = "False" + rgb: str = "True" + rsz_height: str = "" + rsz_width: str = "" + classes: str = os.path.join(cls_test_data_dir, "dnn", "classification_classes_ILSVRC2012.txt") +``` + +The default image preprocessing options are defined in [default_preprocess_config.py](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs/default_preprocess_config.py). For instance: + +```python +BASE_IMG_SCALE_FACTOR = 1 / 255.0 +PYTORCH_RSZ_HEIGHT = 256 +PYTORCH_RSZ_WIDTH = 256 + +pytorch_resize_input_blob = { + "mean": ["123.675", "116.28", "103.53"], + "scale": str(BASE_IMG_SCALE_FACTOR), + "std": ["0.229", "0.224", "0.225"], + "crop": "True", + "rgb": "True", + "rsz_height": str(PYTORCH_RSZ_HEIGHT), + "rsz_width": str(PYTORCH_RSZ_WIDTH) +} +``` + +The basis of the model testing is represented in [samples/dnn/classification.py](https://github.com/opencv/opencv/blob/master/samples/dnn/classification.py). ``classification.py`` can be executed autonomously with the converted model provided in ``--input`` and the populated parameters for cv.dnn.blobFromImage.
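+If you prefer to skip the sample's argument parsing, the same standalone check can be reproduced in a few lines, reusing only the calls already shown above (the paths here are assumptions; this is a sketch, not the sample itself):
+
+```python
+import cv2
+import numpy as np
+
+net = cv2.dnn.readNetFromONNX("models/resnet50.onnx")
+
+# PyTorch-style preprocessing: resize to 256, center crop to 224,
+# scale to [0, 1], subtract the ImageNet mean, divide by std
+img = cv2.imread("../data/squirrel_cls.jpg").astype(np.float32)
+img = cv2.resize(img, (256, 256))
+blob = cv2.dnn.blobFromImage(img, 1 / 255.0, (224, 224),
+                             np.array([0.485, 0.456, 0.406]) * 255.0,
+                             swapRB=True, crop=True)
+blob[0] /= np.asarray([0.229, 0.224, 0.225], dtype=np.float32).reshape(3, 1, 1)
+
+net.setInput(blob)
+out = net.forward()
+print("class ID:", np.argmax(out))  # expected: 335 for the squirrel photo
+```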
+ +To reproduce the OpenCV steps described in "Model Conversion Pipeline" from scratch with ``dnn_model_runner``, execute the line below: + +```console +python -m dnn_model_runner.dnn_conversion.pytorch.classification.py_to_py_cls --model_name resnet50 --test True --default_img_preprocess True --evaluate False +``` + +The network prediction is depicted in the top left corner of the output window: + +![ResNet50 OpenCV inference output](images/pytorch_resnet50_opencv_test_res.jpg) diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_classification/tf_cls_model_conversion_tutorial.md b/doc/tutorials/dnn/dnn_pytorch_tf_classification/tf_cls_model_conversion_tutorial.md new file mode 100644 index 0000000000..c2da541029 --- /dev/null +++ b/doc/tutorials/dnn/dnn_pytorch_tf_classification/tf_cls_model_conversion_tutorial.md @@ -0,0 +1,360 @@ +# Conversion of TensorFlow Classification Models and Launch with OpenCV Python {#tf_cls_tutorial_dnn_conversion} + +| | | +| -: | :- | +| Original author | Anastasia Murzova | +| Compatibility | OpenCV >= 4.5 | + +## Goals +In this tutorial you will learn how to: +* obtain frozen graphs of TensorFlow (TF) classification models +* run converted TensorFlow model with OpenCV Python API +* obtain an evaluation of the TensorFlow and OpenCV DNN models + +We will explore the above-listed points by the example of the MobileNet architecture. + +## Introduction +Let's briefly review the key concepts involved in the pipeline of TensorFlow model transition with OpenCV API. The initial step in the conversion of TensorFlow models into cv.dnn.Net +is obtaining the frozen TF model graph. A frozen graph defines the combination of the model graph structure with the kept values of the required variables, for example, weights. Usually the frozen graph is saved in [protobuf](https://en.wikipedia.org/wiki/Protocol_Buffers) (```.pb```) files. +Once the model ``.pb`` file has been generated, it can be read with the cv.dnn.readNetFromTensorflow function. + +## Requirements +To be able to experiment with the below code you will need to install a set of libraries. We will use a virtual environment with python3.7+ for this: + +```console +virtualenv -p /usr/bin/python3.7 <env_dir> +source <env_dir>/bin/activate +``` + +For OpenCV-Python building from source, follow the corresponding instructions from the @ref tutorial_py_table_of_contents_setup. + +Before you start the installation of the libraries, you can customize the [requirements.txt](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt), excluding or including (for example, ``opencv-python``) some dependencies. +The below line initiates requirements installation into the previously activated virtual environment: + +```console +pip install -r requirements.txt +``` + +## Practice +In this part we are going to cover the following points: +1. create a TF classification model conversion pipeline and provide the inference +2. evaluate and test TF classification models + +If you only want to run the evaluation or test model pipelines, the "Model Conversion Pipeline" tutorial part can be skipped. + +### Model Conversion Pipeline +The code in this subchapter is located in the ``dnn_model_runner`` module and can be executed with the line: + +```console +python -m dnn_model_runner.dnn_conversion.tf.classification.py_to_py_mobilenet +``` + +The following code contains the description of the below-listed steps: +1. instantiate TF model +2. create TF frozen graph +3. read TF frozen graph with OpenCV API +4. prepare input data +5. 
provide inference + +```python +# initialize TF MobileNet model +original_tf_model = MobileNet( + include_top=True, + weights="imagenet" +) + +# get TF frozen graph path +full_pb_path = get_tf_model_proto(original_tf_model) + +# read frozen graph with OpenCV API +opencv_net = cv2.dnn.readNetFromTensorflow(full_pb_path) +print("OpenCV model was successfully read. Model layers: \n", opencv_net.getLayerNames()) + +# get preprocessed image +input_img = get_preprocessed_img("../data/squirrel_cls.jpg") + +# get ImageNet labels +imagenet_labels = get_imagenet_labels("../data/dnn/classification_classes_ILSVRC2012.txt") + +# obtain OpenCV DNN predictions +get_opencv_dnn_prediction(opencv_net, input_img, imagenet_labels) + +# obtain TF model predictions +get_tf_dnn_prediction(original_tf_model, input_img, imagenet_labels) +``` + +To provide model inference we will use the below [squirrel photo](https://www.pexels.com/photo/brown-squirrel-eating-1564292) (under [CC0](https://www.pexels.com/terms-of-service/) license) corresponding to ImageNet class ID 335: +```console +fox squirrel, eastern fox squirrel, Sciurus niger +``` + +![Classification model input image](images/squirrel_cls.jpg) + +For the label decoding of the obtained prediction, we also need the ``imagenet_classes.txt`` file, which contains the full list of the ImageNet classes. + +Let's go deeper into each step by the example of the pretrained TF MobileNet: +* instantiate TF model: + +```python +# initialize TF MobileNet model +original_tf_model = MobileNet( + include_top=True, + weights="imagenet" +) +``` + +* create TF frozen graph: + +```python +# define the directory for the .pb model +pb_model_path = "models" + +# define the name of the .pb model +pb_model_name = "mobilenet.pb" + +# create the directory for the model +os.makedirs(pb_model_path, exist_ok=True) + +# get model TF graph +tf_model_graph = tf.function(lambda x: tf_model(x)) + +# get concrete function +tf_model_graph = tf_model_graph.get_concrete_function( + tf.TensorSpec(tf_model.inputs[0].shape, tf_model.inputs[0].dtype)) + +# obtain frozen concrete function +frozen_tf_func = convert_variables_to_constants_v2(tf_model_graph) +# get frozen graph +frozen_tf_func.graph.as_graph_def() + +# save full tf model +tf.io.write_graph(graph_or_graph_def=frozen_tf_func.graph, + logdir=pb_model_path, + name=pb_model_name, + as_text=False) +``` + +After the successful execution of the above code, we will get a frozen graph in ``models/mobilenet.pb``. + +* read the TF frozen graph with cv.dnn.readNetFromTensorflow, passing the ``mobilenet.pb`` obtained in the previous step into it: + +```python +# get TF frozen graph path +full_pb_path = get_tf_model_proto(original_tf_model) +``` + +* prepare input data with the cv2.dnn.blobFromImage function: + +```python +# read the image +input_img = cv2.imread(img_path, cv2.IMREAD_COLOR) +input_img = input_img.astype(np.float32) + +# define preprocess parameters +mean = np.array([1.0, 1.0, 1.0]) * 127.5 +scale = 1 / 127.5 + +# prepare input blob to fit the model input: +# 1. subtract mean +# 2. scale to set pixel values from -1 to 1 +input_blob = cv2.dnn.blobFromImage( + image=input_img, + scalefactor=scale, + size=(224, 224), # img target size + mean=mean, + swapRB=True, # BGR -> RGB + crop=True # center crop +) +print("Input blob shape: {}\n".format(input_blob.shape)) +``` + +Please pay attention to the preprocessing order in the cv2.dnn.blobFromImage function.
The mean value is subtracted first, and only then are the pixel values multiplied by the defined scale. +Therefore, to reproduce the image preprocessing pipeline from the TF [``mobilenet.preprocess_input``](https://github.com/tensorflow/tensorflow/blob/02032fb477e9417197132648ec81e75beee9063a/tensorflow/python/keras/applications/mobilenet.py#L443-L445) function, we multiply ``mean`` by ``127.5``. + +As a result, a 4-dimensional ``input_blob`` is obtained: + + ``Input blob shape: (1, 3, 224, 224)`` + +* provide OpenCV cv.dnn.Net inference: + +```python +# set OpenCV DNN input +opencv_net.setInput(preproc_img) + +# OpenCV DNN inference +out = opencv_net.forward() +print("OpenCV DNN prediction: \n") +print("* shape: ", out.shape) + +# get the predicted class ID +imagenet_class_id = np.argmax(out) + +# get confidence +confidence = out[0][imagenet_class_id] +print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id])) +print("* confidence: {:.4f}\n".format(confidence)) +``` + +After the above code execution we will get the following output: + +```console +OpenCV DNN prediction: +* shape: (1, 1000) +* class ID: 335, label: fox squirrel, eastern fox squirrel, Sciurus niger +* confidence: 0.9525 +``` + +* provide TF MobileNet inference: + +```python +# inference +preproc_img = preproc_img.transpose(0, 2, 3, 1) +print("TF input blob shape: {}\n".format(preproc_img.shape)) + +out = original_net(preproc_img) + +print("\nTensorFlow model prediction: \n") +print("* shape: ", out.shape) + +# get the predicted class ID +imagenet_class_id = np.argmax(out) +print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id])) + +# get confidence +confidence = out[0][imagenet_class_id] +print("* confidence: {:.4f}".format(confidence)) +``` + +To fit the TF model input, ``input_blob`` was transposed: + +```console +TF input blob shape: (1, 224, 224, 3) +``` + +The TF inference results are the following: + +```console +TensorFlow model prediction: +* shape: (1, 1000) +* class ID: 335, label: fox squirrel, eastern fox squirrel, Sciurus niger +* confidence: 0.9525 +``` + +As can be seen from the experiments, the OpenCV and TF inference results are equal. + +### Evaluation of the Models + +The ``dnn_model_runner`` module proposed in ``samples/dnn`` allows running the full evaluation pipeline on the ImageNet dataset and test execution for the following TensorFlow classification models: +* vgg16 +* vgg19 +* resnet50 +* resnet101 +* resnet152 +* densenet121 +* densenet169 +* densenet201 +* inceptionresnetv2 +* inceptionv3 +* mobilenet +* mobilenetv2 +* nasnetlarge +* nasnetmobile +* xception + +This list can also be extended with further appropriate evaluation pipeline configuration. + +#### Evaluation Mode + +The below line runs the module in evaluation mode: + +```console +python -m dnn_model_runner.dnn_conversion.tf.classification.py_to_py_cls --model_name <model_name> +``` + +The classification model chosen from the list will be read into an OpenCV ``cv.dnn_Net`` object. Evaluation results of TF and OpenCV models (accuracy, inference time, L1) will be written into the log file. Inference time values will also be depicted in a chart to generalize the obtained model information.
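+As a rough illustration of how the inference time can be measured (a simple sketch; the module's actual timing code may differ):
+
+```python
+import time
+
+# opencv_net and input_blob come from the conversion steps above
+start = time.time()
+opencv_net.setInput(input_blob)
+out = opencv_net.forward()
+print("OpenCV DNN forward pass took {:.4f} s".format(time.time() - start))
+```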
+ +Necessary evaluation configurations are defined in the [test_config.py](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs/test_config.py) and can be modified to match the actual data locations: + +```python +@dataclass +class TestClsConfig: + batch_size: int = 50 + frame_size: int = 224 + img_root_dir: str = "./ILSVRC2012_img_val" + # location of image-class matching + img_cls_file: str = "./val.txt" + bgr_to_rgb: bool = True +``` + +The values from ``TestClsConfig`` can be customized in accordance with the chosen model. + +To initiate the evaluation of the TensorFlow MobileNet, run the following line: + +```console +python -m dnn_model_runner.dnn_conversion.tf.classification.py_to_py_cls --model_name mobilenet +``` + +After script launch, the log file with evaluation data will be generated in ``dnn_model_runner/dnn_conversion/logs``: + +```console +===== Running evaluation of the model with the following params: + * val data location: ./ILSVRC2012_img_val + * log file location: dnn_model_runner/dnn_conversion/logs/TF_mobilenet_log.txt +``` + +#### Test Mode + +The below line runs the module in test mode, namely it provides the steps for the model inference: + +```console +python -m dnn_model_runner.dnn_conversion.tf.classification.py_to_py_cls --model_name <model_name> --test True --default_img_preprocess <True/False> --evaluate False +``` + +Here the ``default_img_preprocess`` key defines whether you'd like to parametrize the model test process with some particular values or use the default values, for example, ``scale``, ``mean`` or ``std``. + +The test configuration is represented in the [test_config.py](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs/test_config.py) ``TestClsModuleConfig`` class: + +```python +@dataclass +class TestClsModuleConfig: + cls_test_data_dir: str = "../data" + test_module_name: str = "classification" + test_module_path: str = "classification.py" + input_img: str = os.path.join(cls_test_data_dir, "squirrel_cls.jpg") + model: str = "" + + frame_height: str = str(TestClsConfig.frame_size) + frame_width: str = str(TestClsConfig.frame_size) + scale: str = "1.0" + mean: List[str] = field(default_factory=lambda: ["0.0", "0.0", "0.0"]) + std: List[str] = field(default_factory=list) + crop: str = "False" + rgb: str = "True" + rsz_height: str = "" + rsz_width: str = "" + classes: str = os.path.join(cls_test_data_dir, "dnn", "classification_classes_ILSVRC2012.txt") +``` + +The default image preprocessing options are defined in ``default_preprocess_config.py``. For instance, for MobileNet: + +```python +tf_input_blob = { + "mean": ["127.5", "127.5", "127.5"], + "scale": str(1 / 127.5), + "std": [], + "crop": "True", + "rgb": "True" +} +``` + +The basis of the model testing is represented in [samples/dnn/classification.py](https://github.com/opencv/opencv/blob/master/samples/dnn/classification.py). ``classification.py`` can be executed autonomously with the converted model provided in ``--input`` and the populated parameters for cv.dnn.blobFromImage.
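+The string values from ``tf_input_blob`` map directly onto cv.dnn.blobFromImage parameters. A minimal sketch of that correspondence (illustrative only, reusing the preprocessing already shown above):
+
+```python
+import cv2
+import numpy as np
+
+# mirror the tf_input_blob dictionary values
+img = cv2.imread("../data/squirrel_cls.jpg").astype(np.float32)
+blob = cv2.dnn.blobFromImage(
+    image=img,
+    scalefactor=1 / 127.5,         # "scale"
+    size=(224, 224),
+    mean=(127.5, 127.5, 127.5),    # "mean"
+    swapRB=True,                   # "rgb"
+    crop=True                      # "crop"
+)
+```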
+ +To reproduce the OpenCV steps described in "Model Conversion Pipeline" from scratch with ``dnn_model_runner``, execute the line below: + +```console +python -m dnn_model_runner.dnn_conversion.tf.classification.py_to_py_cls --model_name mobilenet --test True --default_img_preprocess True --evaluate False +``` + +The network prediction is depicted in the top left corner of the output window: + +![TF MobileNet OpenCV inference output](images/tf_mobilenet_opencv_test_res.jpg) diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_detection/images/opencv_bus_res.jpg b/doc/tutorials/dnn/dnn_pytorch_tf_detection/images/opencv_bus_res.jpg new file mode 100644 index 0000000000..8bdc602068 Binary files /dev/null and b/doc/tutorials/dnn/dnn_pytorch_tf_detection/images/opencv_bus_res.jpg differ diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_detection/images/pexels_double_decker_bus.jpg b/doc/tutorials/dnn/dnn_pytorch_tf_detection/images/pexels_double_decker_bus.jpg new file mode 100644 index 0000000000..aca8be09eb Binary files /dev/null and b/doc/tutorials/dnn/dnn_pytorch_tf_detection/images/pexels_double_decker_bus.jpg differ diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_detection/tf_det_model_conversion_tutorial.md b/doc/tutorials/dnn/dnn_pytorch_tf_detection/tf_det_model_conversion_tutorial.md new file mode 100644 index 0000000000..04388cbaf4 --- /dev/null +++ b/doc/tutorials/dnn/dnn_pytorch_tf_detection/tf_det_model_conversion_tutorial.md @@ -0,0 +1,140 @@ +# Conversion of TensorFlow Detection Models and Launch with OpenCV Python {#tf_det_tutorial_dnn_conversion} + +| | | +| -: | :- | +| Original author | Anastasia Murzova | +| Compatibility | OpenCV >= 4.5 | + +## Goals +In this tutorial you will learn how to: +* obtain frozen graphs of TensorFlow (TF) detection models +* run converted TensorFlow model with OpenCV Python API + +We will explore the above-listed points by the example of SSD MobileNetV1. + +## Introduction +Let's briefly review the key concepts involved in the pipeline of TensorFlow model transition with OpenCV API. The initial step in the conversion of TensorFlow models into cv.dnn.Net +is obtaining the frozen TF model graph. A frozen graph defines the combination of the model graph structure with the kept values of the required variables, for example, weights. The frozen graph is saved in [protobuf](https://en.wikipedia.org/wiki/Protocol_Buffers) (```.pb```) files. +There are special functions for reading ``.pb`` graphs in OpenCV: cv.dnn.readNetFromTensorflow and cv.dnn.readNet. + +## Requirements +To be able to experiment with the below code you will need to install a set of libraries. We will use a virtual environment with python3.7+ for this: + +```console +virtualenv -p /usr/bin/python3.7 <env_dir> +source <env_dir>/bin/activate +``` + +For OpenCV-Python building from source, follow the corresponding instructions from the @ref tutorial_py_table_of_contents_setup. + +Before you start the installation of the libraries, you can customize the [requirements.txt](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt), excluding or including (for example, ``opencv-python``) some dependencies. +The below line initiates requirements installation into the previously activated virtual environment: + +```console +pip install -r requirements.txt +``` + +## Practice +In this part we are going to cover the following points: +1. prepare a TF detection model for conversion +2. 
provide the inference, process prediction results + +### Model Preparation +The code in this subchapter is located in the ``samples/dnn/dnn_model_runner`` module and can be executed with the below line: + +```console +python -m dnn_model_runner.dnn_conversion.tf.detection.py_to_py_ssd_mobilenet +``` + +The following code contains the steps of the TF SSD MobileNetV1 model retrieval: + +```python + tf_model_name = 'ssd_mobilenet_v1_coco_2017_11_17' + graph_extraction_dir = "./" + frozen_graph_path = extract_tf_frozen_graph(tf_model_name, graph_extraction_dir) + print("Frozen graph path for {}: {}".format(tf_model_name, frozen_graph_path)) +``` + +In the ``extract_tf_frozen_graph`` function we extract the ``frozen_inference_graph.pb`` provided in the model archive for further processing: + +```python +# define model archive name +tf_model_tar = model_name + '.tar.gz' +# define link to retrieve model archive +model_link = DETECTION_MODELS_URL + tf_model_tar + +tf_frozen_graph_name = 'frozen_inference_graph' + +try: + urllib.request.urlretrieve(model_link, tf_model_tar) +except Exception: + print("TF {} was not retrieved: {}".format(model_name, model_link)) + return + +print("TF {} was retrieved.".format(model_name)) + +tf_model_tar = tarfile.open(tf_model_tar) +frozen_graph_path = "" + +for model_tar_elem in tf_model_tar.getmembers(): + if tf_frozen_graph_name in os.path.basename(model_tar_elem.name): + tf_model_tar.extract(model_tar_elem, extracted_model_path) + frozen_graph_path = os.path.join(extracted_model_path, model_tar_elem.name) + break +tf_model_tar.close() +``` + +After the successful execution of the above code we will get the following output: + +```console +TF ssd_mobilenet_v1_coco_2017_11_17 was retrieved. +Frozen graph path for ssd_mobilenet_v1_coco_2017_11_17: ./ssd_mobilenet_v1_coco_2017_11_17/frozen_inference_graph.pb +``` + +To provide model inference we will use the below [double-decker bus photo](https://www.pexels.com/photo/bus-and-car-on-one-way-street-3626589/) (under [Pexels](https://www.pexels.com/license/) license): + +![Double-decker bus](images/pexels_double_decker_bus.jpg) + +To initiate the test process we need to provide an appropriate model configuration. We will use [``ssd_mobilenet_v1_coco.config``](https://github.com/tensorflow/models/blob/master/research/object_detection/samples/configs/ssd_mobilenet_v1_coco.config) from [TensorFlow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection#tensorflow-object-detection-api). +The TensorFlow Object Detection API framework contains helpful mechanisms for object detection model manipulations. + +We will use this configuration to provide a text graph representation. To generate the ``.pbtxt`` file we will use the corresponding [``samples/dnn/tf_text_graph_ssd.py``](https://github.com/opencv/opencv/blob/master/samples/dnn/tf_text_graph_ssd.py) script: + +```console +python tf_text_graph_ssd.py --input ssd_mobilenet_v1_coco_2017_11_17/frozen_inference_graph.pb --config ssd_mobilenet_v1_coco_2017_11_17/ssd_mobilenet_v1_coco.config --output ssd_mobilenet_v1_coco_2017_11_17.pbtxt +``` + +After successful execution, ``ssd_mobilenet_v1_coco_2017_11_17.pbtxt`` will be created. + +Before we run ``object_detection.py``, let's have a look at the default values for the SSD MobileNetV1 test process configuration.
They are located in [``models.yml``](https://github.com/opencv/opencv/blob/master/samples/dnn/models.yml): + +```yml +ssd_tf: + model: "ssd_mobilenet_v1_coco_2017_11_17.pb" + config: "ssd_mobilenet_v1_coco_2017_11_17.pbtxt" + mean: [0, 0, 0] + scale: 1.0 + width: 300 + height: 300 + rgb: true + classes: "object_detection_classes_coco.txt" + sample: "object_detection" +``` + +To run the model with these values, we need to provide the frozen graph model ``ssd_mobilenet_v1_coco_2017_11_17.pb`` and the text graph ``ssd_mobilenet_v1_coco_2017_11_17.pbtxt``: + +```console +python object_detection.py ssd_tf --input ../data/pexels_double_decker_bus.jpg +``` + +This line is equivalent to: + +```console +python object_detection.py --model ssd_mobilenet_v1_coco_2017_11_17.pb --config ssd_mobilenet_v1_coco_2017_11_17.pbtxt --input ../data/pexels_double_decker_bus.jpg --width 300 --height 300 --classes ../data/dnn/object_detection_classes_coco.txt +``` + +The result is: + +![OpenCV SSD bus result](images/opencv_bus_res.jpg) + +There are several helpful parameters which can also be customized to correct the results: the threshold (``--thr``) and non-maximum suppression (``--nms``) values. diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_segmentation/pytorch_sem_segm_model_conversion_tutorial.md b/doc/tutorials/dnn/dnn_pytorch_tf_segmentation/pytorch_sem_segm_model_conversion_tutorial.md new file mode 100644 index 0000000000..368007ee22 --- /dev/null +++ b/doc/tutorials/dnn/dnn_pytorch_tf_segmentation/pytorch_sem_segm_model_conversion_tutorial.md @@ -0,0 +1,332 @@ +# Conversion of PyTorch Segmentation Models and Launch with OpenCV {#pytorch_segm_tutorial_dnn_conversion} + +## Goals +In this tutorial you will learn how to: +* convert PyTorch segmentation models +* run converted PyTorch model with OpenCV +* obtain an evaluation of the PyTorch and OpenCV DNN models + +We will explore the above-listed points by the example of the FCN ResNet-50 architecture. + +## Introduction +The key points involved in the transition pipeline of the [PyTorch classification](https://link_to_cls_tutorial) and segmentation models with OpenCV API are the same. The first step is transferring the model into [ONNX](https://onnx.ai/about.html) format with the PyTorch built-in function [``torch.onnx.export``](https://pytorch.org/docs/stable/onnx.html#torch.onnx.export). +Then the obtained ``.onnx`` model is passed into cv.dnn.readNetFromONNX, which returns a cv.dnn.Net object ready for DNN manipulations. + +## Practice +In this part we are going to cover the following points: +1. create a segmentation model conversion pipeline and provide the inference +2. evaluate and test segmentation models + +If you only want to run the evaluation or test model pipelines, the "Model Conversion Pipeline" part can be skipped. + +### Model Conversion Pipeline +The code in this subchapter is located in the ``dnn_model_runner`` module and can be executed with the line: + +``` +python -m dnn_model_runner.dnn_conversion.pytorch.segmentation.py_to_py_fcnresnet50 +``` + +The following code contains the description of the below-listed steps: +1. instantiate PyTorch model +2. convert PyTorch model into ``.onnx`` +3. read the transferred network with OpenCV API +4. prepare input data +5. provide inference +6. get colored masks from predictions +7. 
visualize results + +```python +# initialize PyTorch FCN ResNet-50 model +original_model = models.segmentation.fcn_resnet50(pretrained=True) + +# get the path to the PyTorch model converted into ONNX +full_model_path = get_pytorch_onnx_model(original_model) + +# read converted .onnx model with OpenCV API +opencv_net = cv2.dnn.readNetFromONNX(full_model_path) +print("OpenCV model was successfully read. Layer IDs: \n", opencv_net.getLayerNames()) + +# get preprocessed image +img, input_img = get_processed_imgs("test_data/sem_segm/2007_000033.jpg") + +# obtain OpenCV DNN predictions +opencv_prediction = get_opencv_dnn_prediction(opencv_net, input_img) + +# obtain original PyTorch FCN ResNet-50 predictions +pytorch_prediction = get_pytorch_dnn_prediction(original_model, input_img) + +pascal_voc_classes, pascal_voc_colors = read_colors_info("test_data/sem_segm/pascal-classes.txt") + +# obtain colored segmentation masks +opencv_colored_mask = get_colored_mask(img.shape, opencv_prediction, pascal_voc_colors) +pytorch_colored_mask = get_colored_mask(img.shape, pytorch_prediction, pascal_voc_colors) + +# obtain palette of PASCAL VOC colors +color_legend = get_legend(pascal_voc_classes, pascal_voc_colors) + +cv2.imshow('PyTorch Colored Mask', pytorch_colored_mask) +cv2.imshow('OpenCV DNN Colored Mask', opencv_colored_mask) +cv2.imshow('Color Legend', color_legend) + +cv2.waitKey(0) +``` + +To provide the model inference we will use the below picture from the [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) validation dataset: + +![PASCAL VOC img](images/2007_000033.jpg) + +The target segmented result is: + +![PASCAL VOC ground truth](images/2007_000033.png) + +For decoding the PASCAL VOC colors and mapping them to the predicted masks, we also need the ``pascal-classes.txt`` file, which contains the full list of the PASCAL VOC classes and the corresponding colors. + +Let's go deeper into each code step by the example of the pretrained PyTorch FCN ResNet-50: +* instantiate PyTorch FCN ResNet-50 model: + +```python +# initialize PyTorch FCN ResNet-50 model +original_model = models.segmentation.fcn_resnet50(pretrained=True) +``` + +* convert PyTorch model into ONNX format: + +```python +# define the directory to save the converted model in +onnx_model_path = "models" +# define the name of the converted model +onnx_model_name = "fcnresnet50.onnx" + +# create the directory for the converted model +os.makedirs(onnx_model_path, exist_ok=True) + +# get full path to the converted model +full_model_path = os.path.join(onnx_model_path, onnx_model_name) + +# generate model input to build the graph +generated_input = Variable( + torch.randn(1, 3, 500, 500) +) + +# model export into ONNX format +torch.onnx.export( + original_model, + generated_input, + full_model_path, + verbose=True, + input_names=["input"], + output_names=["output"], + opset_version=11 +) +``` + +The code from this step does not differ from the classification conversion case. Thus, after the successful execution of the above code, we will get ``models/fcnresnet50.onnx``.
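+Optionally, the exported graph can be validated with the ``onnx`` package before it is handed to OpenCV (an extra sanity check, not part of the original sample):
+
+```python
+import onnx
+
+# load and structurally validate the exported graph;
+# check_model raises an exception if the model is malformed
+onnx_model = onnx.load("models/fcnresnet50.onnx")
+onnx.checker.check_model(onnx_model)
+print("fcnresnet50.onnx passed the ONNX checker")
+```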
+
+* read the transferred network with cv.dnn.readNetFromONNX, passing the ONNX model obtained in the previous step into it:
+
+```python
+# read converted .onnx model with OpenCV API
+opencv_net = cv2.dnn.readNetFromONNX(full_model_path)
+```
+
+* prepare input data:
+
+```python
+# read the image
+input_img = cv2.imread(img_path, cv2.IMREAD_COLOR)
+input_img = input_img.astype(np.float32)
+
+# target image sizes
+img_height = input_img.shape[0]
+img_width = input_img.shape[1]
+
+# define preprocess parameters
+mean = np.array([0.485, 0.456, 0.406]) * 255.0
+scale = 1 / 255.0
+std = [0.229, 0.224, 0.225]
+
+# prepare input blob to fit the model input:
+# 1. subtract mean
+# 2. scale to set pixel values from 0 to 1
+input_blob = cv2.dnn.blobFromImage(
+    image=input_img,
+    scalefactor=scale,
+    size=(img_width, img_height),  # img target size
+    mean=mean,
+    swapRB=True,  # BGR -> RGB
+    crop=False  # no center crop
+)
+# 3. divide by std
+input_blob[0] /= np.asarray(std, dtype=np.float32).reshape(3, 1, 1)
+```
+
+In this step we read the image and prepare the model input with the cv2.dnn.blobFromImage function, which returns a 4-dimensional blob.
+Note that ``cv2.dnn.blobFromImage`` first subtracts the mean value and only then scales the pixel values. Thus, ``mean`` is multiplied by ``255.0`` to reproduce the original image preprocessing order:
+
+```python
+img /= 255.0
+img -= [0.485, 0.456, 0.406]
+img /= [0.229, 0.224, 0.225]
+```
+
+* OpenCV ``cv.dnn_Net`` inference:
+
+```python
+# set OpenCV DNN input
+opencv_net.setInput(preproc_img)
+
+# OpenCV DNN inference
+out = opencv_net.forward()
+print("OpenCV DNN segmentation prediction: \n")
+print("* shape: ", out.shape)
+
+# get IDs of predicted classes
+out_predictions = np.argmax(out[0], axis=0)
+```
+
+Executing the above code produces the following output:
+
+```
+OpenCV DNN segmentation prediction:
+* shape:  (1, 21, 500, 500)
+```
+
+Each of the 21 prediction channels, where 21 is the number of PASCAL VOC classes, contains probabilities indicating how likely each pixel corresponds to that class.
+
+* PyTorch FCN ResNet-50 model inference:
+
+```python
+original_net.eval()
+preproc_img = torch.FloatTensor(preproc_img)
+
+with torch.no_grad():
+    # obtaining unnormalized probabilities for each class
+    out = original_net(preproc_img)['out']
+
+print("\nPyTorch segmentation model prediction: \n")
+print("* shape: ", out.shape)
+
+# get IDs of predicted classes
+out_predictions = out[0].argmax(dim=0)
+```
+
+Running the above code produces the following output:
+
+```
+PyTorch segmentation model prediction:
+* shape:  torch.Size([1, 21, 366, 500])
+```
+
+The PyTorch prediction also contains probabilities for each class.
+
+* get colored masks from predictions:
+
+```python
+# convert mask values into PASCAL VOC colors
+processed_mask = np.stack([colors[color_id] for color_id in segm_mask.flatten()])
+
+# reshape mask into 3-channel image
+processed_mask = processed_mask.reshape(mask_height, mask_width, 3)
+processed_mask = cv2.resize(processed_mask, (img_width, img_height), interpolation=cv2.INTER_NEAREST).astype(
+    np.uint8)
+
+# convert colored mask from BGR to RGB for compatibility with PASCAL VOC colors
+processed_mask = cv2.cvtColor(processed_mask, cv2.COLOR_BGR2RGB)
+```
+
+In this step we map the predicted class IDs from the segmentation mask to the appropriate PASCAL VOC colors.
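+
+To make the color-mapping step more tangible, here is a toy, self-contained illustration of the same ``np.stack`` indexing on a 2x2 mask with a three-color palette (the palette values happen to be the first three PASCAL VOC colors; the snippet is illustrative and not part of the sample code):
+
+```python
+import numpy as np
+
+# three-entry palette: background, aeroplane, bicycle
+colors = np.array([[0, 0, 0], [128, 0, 0], [0, 128, 0]], dtype=np.uint8)
+# a 2x2 "mask" of predicted class IDs
+segm_mask = np.array([[0, 1], [2, 1]])
+
+# look up the color of every mask cell, then restore the 2D layout
+processed_mask = np.stack([colors[color_id] for color_id in segm_mask.flatten()])
+processed_mask = processed_mask.reshape(2, 2, 3)
+
+print(processed_mask[0, 1])  # [128   0   0] -> the class 1 color
+```
+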
+Let's have a look at the results:
+
+![OpenCV Colored Mask](images/legend_opencv_color_mask.png)
+
+For extended evaluation of the models, we can use the ``py_to_py_segm`` script of the ``dnn_model_runner`` module. This part of the module is described in the next subchapter.
+
+### Evaluation of the Models
+
+The ``dnn_model_runner`` module provided in ``dnn/samples`` allows running the full evaluation pipeline on the PASCAL VOC dataset and test execution for the following PyTorch segmentation models:
+* FCN ResNet-50
+* FCN ResNet-101
+
+This list can also be extended with an appropriate evaluation pipeline configuration.
+
+#### Evaluation Mode
+
+The line below runs the module in evaluation mode:
+
+```
+python -m dnn_model_runner.dnn_conversion.pytorch.segmentation.py_to_py_segm --model_name
+```
+
+The segmentation model chosen from the list will be read into an OpenCV ``cv.dnn_Net`` object. Evaluation results of the PyTorch and OpenCV models (pixel accuracy, mean IoU, inference time) will be written into the log file. Inference time values will also be depicted in a chart to summarize the obtained model information.
+
+The necessary evaluation configurations are defined in [``test_config.py``](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs/test_config.py):
+
+```python
+@dataclass
+class TestSegmConfig:
+    frame_size: int = 500
+    img_root_dir: str = "./VOC2012"
+    img_dir: str = os.path.join(img_root_dir, "JPEGImages/")
+    img_segm_gt_dir: str = os.path.join(img_root_dir, "SegmentationClass/")
+    # reduced val: https://github.com/shelhamer/fcn.berkeleyvision.org/blob/master/data/pascal/seg11valid.txt
+    segm_val_file: str = os.path.join(img_root_dir, "ImageSets/Segmentation/seg11valid.txt")
+    colour_file_cls: str = os.path.join(img_root_dir, "ImageSets/Segmentation/pascal-classes.txt")
+```
+
+These values can be modified in accordance with the chosen model pipeline.
+
+To initiate the evaluation of the PyTorch FCN ResNet-50, run the following line:
+
+```
+python -m dnn_model_runner.dnn_conversion.pytorch.segmentation.py_to_py_segm --model_name fcnresnet50
+```
+
+#### Test Mode
+
+The line below runs the module in test mode, which walks through the model inference steps:
+
+```
+python -m dnn_model_runner.dnn_conversion.pytorch.segmentation.py_to_py_segm --model_name --test True --default_img_preprocess --evaluate False
+```
+
+Here the ``default_img_preprocess`` key defines whether you'd like to parametrize the model test process with particular values or use the default values of, for example, ``scale``, ``mean`` or ``std``.
+
+The test configuration is defined in the [``test_config.py``](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs/test_config.py) ``TestSegmModuleConfig`` class:
+
+```python
+@dataclass
+class TestSegmModuleConfig:
+    segm_test_data_dir: str = "test_data/sem_segm"
+    test_module_name: str = "segmentation"
+    test_module_path: str = "segmentation.py"
+    input_img: str = os.path.join(segm_test_data_dir, "2007_000033.jpg")
+    model: str = ""
+
+    frame_height: str = str(TestSegmConfig.frame_size)
+    frame_width: str = str(TestSegmConfig.frame_size)
+    scale: float = 1.0
+    mean: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0])
+    std: List[float] = field(default_factory=list)
+    crop: bool = False
+    rgb: bool = True
+    classes: str = os.path.join(segm_test_data_dir, "pascal-classes.txt")
+```
+
+The default image preprocessing options are defined in ``default_preprocess_config.py``:
+
+```python
+pytorch_segm_input_blob = {
+    "mean": ["123.675", "116.28", "103.53"],
+    "scale": str(1 / 255.0),
+    "std": ["0.229", "0.224", "0.225"],
+    "crop": "False",
+    "rgb": "True"
+}
+```
+
+The basis of the model testing is ``samples/dnn/segmentation.py``. ``segmentation.py`` can be executed autonomously with the converted model provided in ``--input`` and the parameters for ``cv2.dnn.blobFromImage`` populated.
+
+To reproduce the OpenCV steps described in "Model Conversion Pipeline" from scratch with ``dnn_model_runner``, execute the line below:
+
+```
+python -m dnn_model_runner.dnn_conversion.pytorch.segmentation.py_to_py_segm --model_name fcnresnet50 --test True --default_img_preprocess True --evaluate False
+```
diff --git a/doc/tutorials/dnn/dnn_pytorch_tf_segmentation/tf_sem_segm_model_conversion_tutorial.md b/doc/tutorials/dnn/dnn_pytorch_tf_segmentation/tf_sem_segm_model_conversion_tutorial.md
new file mode 100644
index 0000000000..bcf9749e2e
--- /dev/null
+++ b/doc/tutorials/dnn/dnn_pytorch_tf_segmentation/tf_sem_segm_model_conversion_tutorial.md
@@ -0,0 +1,406 @@
+# Conversion of TensorFlow Segmentation Models and Launch with OpenCV {#tf_segm_tutorial_dnn_conversion}
+
+## Goals
+In this tutorial you will learn how to:
+* convert TensorFlow (TF) segmentation models
+* run the converted TensorFlow model with OpenCV
+* obtain an evaluation of the TensorFlow and OpenCV DNN models
+
+We will explore the above-listed points using the DeepLab architecture as an example.
+
+## Introduction
+The key concepts involved in the transition pipeline of the [TensorFlow classification](https://link_to_cls_tutorial) and segmentation models with the OpenCV API are almost the same, except for the graph optimization phase. The initial step in the conversion of TensorFlow models into cv.dnn.Net
+is obtaining the frozen TF model graph. A frozen graph combines the model graph structure with the kept values of the required variables, for example, the weights. The frozen graph is usually saved in [protobuf](https://en.wikipedia.org/wiki/Protocol_Buffers) (``.pb``) files.
+To read the generated segmentation model ``.pb`` file with cv.dnn.readNetFromTensorflow, the graph needs to be modified with the TF [graph transform tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/graph_transforms).
+
+## Practice
+In this part we are going to cover the following points:
+1. create a TF segmentation model conversion pipeline and provide the inference
+2. evaluate and test TF segmentation models
+
+If you only want to run the evaluation or test pipelines, the "Model Conversion Pipeline" tutorial part can be skipped.
+
+### Model Conversion Pipeline
+The code in this subchapter is located in the ``dnn_model_runner`` module and can be executed with the line:
+
+```
+python -m dnn_model_runner.dnn_conversion.tf.segmentation.py_to_py_deeplab
+```
+
+TensorFlow segmentation models can be found in the [TensorFlow Research Models](https://github.com/tensorflow/models/tree/master/research/#tensorflow-research-models) section, which contains implementations of models based on published research papers.
+We will retrieve the archive with the pre-trained TF DeepLabV3 from the link below:
+
+```
+http://download.tensorflow.org/models/deeplabv3_mnv2_pascal_trainval_2018_01_29.tar.gz
+```
+
+The full pipeline for obtaining the frozen graph is described in ``deeplab_retrievement.py``:
+
+```python
+def get_deeplab_frozen_graph():
+    # define model path to download
+    models_url = 'http://download.tensorflow.org/models/'
+    mobilenetv2_voctrainval = 'deeplabv3_mnv2_pascal_trainval_2018_01_29.tar.gz'
+
+    # construct model link to download
+    model_link = models_url + mobilenetv2_voctrainval
+
+    try:
+        urllib.request.urlretrieve(model_link, mobilenetv2_voctrainval)
+    except Exception:
+        print("TF DeepLabV3 was not retrieved: {}".format(model_link))
+        return
+
+    tf_model_tar = tarfile.open(mobilenetv2_voctrainval)
+
+    # iterate over the obtained model archive
+    for model_tar_elem in tf_model_tar.getmembers():
+        # check whether the model archive contains the frozen graph
+        if TF_FROZEN_GRAPH_NAME in os.path.basename(model_tar_elem.name):
+            # extract the frozen graph
+            tf_model_tar.extract(model_tar_elem, FROZEN_GRAPH_PATH)
+
+    tf_model_tar.close()
+```
+
+After running this script:
+
+```
+python -m dnn_model_runner.dnn_conversion.tf.segmentation.deeplab_retrievement
+```
+
+we will get ``frozen_inference_graph.pb`` in ``deeplab/deeplabv3_mnv2_pascal_trainval``.
+
+Before loading the network with OpenCV, the extracted ``frozen_inference_graph.pb`` needs to be optimized.
+To optimize the graph we use TF ``TransformGraph`` with default parameters:
+
+```python
+DEFAULT_OPT_GRAPH_NAME = "optimized_frozen_inference_graph.pb"
+DEFAULT_INPUTS = "sub_7"
+DEFAULT_OUTPUTS = "ResizeBilinear_3"
+DEFAULT_TRANSFORMS = "remove_nodes(op=Identity)" \
+                     " merge_duplicate_nodes" \
+                     " strip_unused_nodes" \
+                     " fold_constants(ignore_errors=true)" \
+                     " fold_batch_norms" \
+                     " fold_old_batch_norms"
+
+
+def optimize_tf_graph(
+        in_graph,
+        out_graph=DEFAULT_OPT_GRAPH_NAME,
+        inputs=DEFAULT_INPUTS,
+        outputs=DEFAULT_OUTPUTS,
+        transforms=DEFAULT_TRANSFORMS,
+        is_manual=True,
+        was_optimized=True
+):
+    # ...
+
+    tf_opt_graph = TransformGraph(
+        tf_graph,
+        inputs,
+        outputs,
+        transforms
+    )
+```
+
+To run the graph optimization process, execute the line:
+
+```
+python -m dnn_model_runner.dnn_conversion.tf.segmentation.tf_graph_optimizer --in_graph deeplab/deeplabv3_mnv2_pascal_trainval/frozen_inference_graph.pb
+```
+
+As a result, the ``deeplab/deeplabv3_mnv2_pascal_trainval`` directory will contain ``optimized_frozen_inference_graph.pb``.
+
+After we have obtained the model graphs, let's go through the steps listed below:
+1. read TF ``frozen_inference_graph.pb`` graph
+2. read optimized TF frozen graph with OpenCV API
+3. prepare input data
+4. provide inference
+5. get colored masks from predictions
+6. visualize results
+
+```python
+# get TF model graph from the obtained frozen graph
+deeplab_graph = read_deeplab_frozen_graph(deeplab_frozen_graph_path)
+
+# read DeepLab frozen graph with OpenCV API
+opencv_net = cv2.dnn.readNetFromTensorflow(opt_deeplab_frozen_graph_path)
+print("OpenCV model was successfully read. Model layers: \n", opencv_net.getLayerNames())
+
+# get processed image
+original_img_shape, tf_input_blob, opencv_input_img = get_processed_imgs("test_data/sem_segm/2007_000033.jpg")
+
+# obtain OpenCV DNN predictions
+opencv_prediction = get_opencv_dnn_prediction(opencv_net, opencv_input_img)
+
+# obtain TF model predictions
+tf_prediction = get_tf_dnn_prediction(deeplab_graph, tf_input_blob)
+
+# get PASCAL VOC classes and colors
+pascal_voc_classes, pascal_voc_colors = read_colors_info("test_data/sem_segm/pascal-classes.txt")
+
+# obtain colored segmentation masks
+opencv_colored_mask = get_colored_mask(original_img_shape, opencv_prediction, pascal_voc_colors)
+tf_colored_mask = get_tf_colored_mask(original_img_shape, tf_prediction, pascal_voc_colors)
+
+# obtain palette of PASCAL VOC colors
+color_legend = get_legend(pascal_voc_classes, pascal_voc_colors)
+
+cv2.imshow('TensorFlow Colored Mask', tf_colored_mask)
+cv2.imshow('OpenCV DNN Colored Mask', opencv_colored_mask)
+
+cv2.imshow('Color Legend', color_legend)
+```
+
+For the model inference we will use the picture below from the [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) validation dataset:
+
+![PASCAL VOC img](images/2007_000033.jpg)
+
+The target segmented result is:
+
+![PASCAL VOC ground truth](images/2007_000033.png)
+
+To decode the PASCAL VOC colors and map them onto the predicted masks, we also need the ``pascal-classes.txt`` file, which contains the full list of the PASCAL VOC classes and their corresponding colors.
+
+Let's go through each step using the pretrained TF DeepLabV3 MobileNetV2 as an example:
+
+* read TF ``frozen_inference_graph.pb`` graph:
+
+```python
+# init deeplab model graph
+model_graph = tf.Graph()
+
+# obtain the graph definition from the frozen graph file
+with tf.io.gfile.GFile(frozen_graph_path, 'rb') as graph_file:
+    tf_model_graph = GraphDef()
+    tf_model_graph.ParseFromString(graph_file.read())
+
+with model_graph.as_default():
+    tf.import_graph_def(tf_model_graph, name='')
+```
+
+* read optimized TF frozen graph with OpenCV API:
+
+```python
+# read DeepLab frozen graph with OpenCV API
+opencv_net = cv2.dnn.readNetFromTensorflow(opt_deeplab_frozen_graph_path)
+```
+
+* prepare input data with the cv2.dnn.blobFromImage function:
+
+```python
+# read the image
+input_img = cv2.imread(img_path, cv2.IMREAD_COLOR)
+input_img = input_img.astype(np.float32)
+
+# preprocess image for TF model input
+tf_preproc_img = cv2.resize(input_img, (513, 513))
+tf_preproc_img = cv2.cvtColor(tf_preproc_img, cv2.COLOR_BGR2RGB)
+
+# define preprocess parameters for OpenCV DNN
+mean = np.array([1.0, 1.0, 1.0]) * 127.5
+scale = 1 / 127.5
+
+# prepare input blob to fit the model input:
+# 1. subtract mean
+# 2. scale pixel values to the [-1, 1] range
+input_blob = cv2.dnn.blobFromImage(
+    image=input_img,
+    scalefactor=scale,
+    size=(513, 513),  # img target size
+    mean=mean,
+    swapRB=True,  # BGR -> RGB
+    crop=False  # no center crop
+)
+```
+
+Please pay attention to the preprocessing order in the ``cv2.dnn.blobFromImage`` function: first the mean value is subtracted and only then are the pixel values multiplied by the defined scale.
+Therefore, to reproduce the TF image preprocessing pipeline, we multiply ``mean`` by ``127.5``.
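+
+As a quick, self-contained sanity check of this claim (purely illustrative, not part of the sample code), we can compare the resulting blob against a manual TF-style normalization:
+
+```python
+import cv2
+import numpy as np
+
+# random stand-in for a BGR image
+img = np.random.randint(0, 256, (513, 513, 3)).astype(np.float32)
+
+blob = cv2.dnn.blobFromImage(
+    image=img,
+    scalefactor=1 / 127.5,
+    size=(513, 513),
+    mean=np.array([1.0, 1.0, 1.0]) * 127.5,
+    swapRB=True,
+    crop=False
+)
+
+# manual reference: swap BGR -> RGB, then map pixels into [-1, 1]
+reference = (img[:, :, ::-1] - 127.5) / 127.5
+
+# the blob is NCHW; bring it back to HWC for the comparison
+restored = blob[0].transpose(1, 2, 0)
+print(np.allclose(restored, reference, atol=1e-4))  # True
+```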
+
+Another important point is the image preprocessing for TF DeepLab. To pass the image into the TF model we only need to construct an appropriate shape; the rest of the image preprocessing is described in [feature_extractor.py](https://github.com/tensorflow/models/blob/master/research/deeplab/core/feature_extractor.py) and will be invoked automatically.
+
+* provide OpenCV ``cv.dnn_Net`` inference:
+
+```python
+# set OpenCV DNN input
+opencv_net.setInput(preproc_img)
+
+# OpenCV DNN inference
+out = opencv_net.forward()
+print("OpenCV DNN segmentation prediction: \n")
+print("* shape: ", out.shape)
+
+# get IDs of predicted classes
+out_predictions = np.argmax(out[0], axis=0)
+```
+
+Executing the above code produces the following output:
+
+```
+OpenCV DNN segmentation prediction:
+* shape:  (1, 21, 513, 513)
+```
+
+Each of the 21 prediction channels, where 21 is the number of PASCAL VOC classes, contains probabilities indicating how likely each pixel corresponds to that class.
+
+* provide TF model inference:
+
+```python
+preproc_img = np.expand_dims(preproc_img, 0)
+
+# init TF session
+tf_session = Session(graph=model_graph)
+
+input_tensor_name = "ImageTensor:0"
+output_tensor_name = "SemanticPredictions:0"
+
+# run inference; the batch dimension was already added by np.expand_dims
+out = tf_session.run(
+    output_tensor_name,
+    feed_dict={input_tensor_name: preproc_img}
+)
+
+print("TF segmentation model prediction: \n")
+print("* shape: ", out.shape)
+```
+
+The TF inference results are the following:
+
+```
+TF segmentation model prediction:
+* shape:  (1, 513, 513)
+```
+
+The TensorFlow prediction contains the indexes of the corresponding PASCAL VOC classes.
+
+* transform OpenCV prediction into colored mask:
+
+```python
+mask_height = segm_mask.shape[0]
+mask_width = segm_mask.shape[1]
+
+img_height = original_img_shape[0]
+img_width = original_img_shape[1]
+
+# convert mask values into PASCAL VOC colors
+processed_mask = np.stack([colors[color_id] for color_id in segm_mask.flatten()])
+
+# reshape mask into 3-channel image
+processed_mask = processed_mask.reshape(mask_height, mask_width, 3)
+processed_mask = cv2.resize(processed_mask, (img_width, img_height), interpolation=cv2.INTER_NEAREST).astype(
+    np.uint8)
+
+# convert colored mask from BGR to RGB
+processed_mask = cv2.cvtColor(processed_mask, cv2.COLOR_BGR2RGB)
+```
+
+In this step we map the predicted class IDs from the segmentation mask to the appropriate PASCAL VOC colors. Let's have a look at the results:
+
+![Color Legend](images/colors_legend.png)
+
+![OpenCV Colored Mask](images/deeplab_opencv_colored_mask.png)
+
+* transform TF prediction into colored mask:
+
+```python
+colors = np.array(colors)
+processed_mask = colors[segm_mask[0]]
+
+img_height = original_img_shape[0]
+img_width = original_img_shape[1]
+
+processed_mask = cv2.resize(processed_mask, (img_width, img_height), interpolation=cv2.INTER_NEAREST).astype(
+    np.uint8)
+
+# convert colored mask from BGR to RGB for compatibility with PASCAL VOC colors
+processed_mask = cv2.cvtColor(processed_mask, cv2.COLOR_BGR2RGB)
+```
+
+The result is:
+
+![TF Colored Mask](images/deeplab_tf_colored_mask.png)
+
+As a result, we get two equal segmentation masks.
+
+### Evaluation of the Models
+
+The ``dnn_model_runner`` module provided in ``dnn/samples`` allows running the full evaluation pipeline on the PASCAL VOC dataset and test execution for the DeepLab MobileNet model.
+
+#### Evaluation Mode
+
+The line below runs the module in evaluation mode:
+
+```
+python -m dnn_model_runner.dnn_conversion.tf.segmentation.py_to_py_segm
+```
+
+The model will be read into an OpenCV ``cv.dnn_Net`` object. Evaluation results of the TF and OpenCV models (pixel accuracy, mean IoU, inference time) will be written into the log file. Inference time values will also be depicted in a chart to summarize the obtained model information.
+
+The necessary evaluation configurations are defined in [``test_config.py``](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs/test_config.py):
+
+```python
+@dataclass
+class TestSegmConfig:
+    frame_size: int = 500
+    img_root_dir: str = "./VOC2012"
+    img_dir: str = os.path.join(img_root_dir, "JPEGImages/")
+    img_segm_gt_dir: str = os.path.join(img_root_dir, "SegmentationClass/")
+    # reduced val: https://github.com/shelhamer/fcn.berkeleyvision.org/blob/master/data/pascal/seg11valid.txt
+    segm_val_file: str = os.path.join(img_root_dir, "ImageSets/Segmentation/seg11valid.txt")
+    colour_file_cls: str = os.path.join(img_root_dir, "ImageSets/Segmentation/pascal-classes.txt")
+```
+
+These values can be modified in accordance with the chosen model pipeline.
+
+#### Test Mode
+
+The line below runs the module in test mode, which walks through the model inference steps:
+
+```
+python -m dnn_model_runner.dnn_conversion.tf.segmentation.py_to_py_segm --test True --default_img_preprocess --evaluate False
+```
+
+Here the ``default_img_preprocess`` key defines whether you'd like to parametrize the model test process with particular values or use the default values of, for example, ``scale``, ``mean`` or ``std``.
+
+The test configuration is defined in the [``test_config.py``](https://github.com/opencv/opencv/tree/master/samples/dnn/dnn_model_runner/dnn_conversion/common/test/configs/test_config.py) ``TestSegmModuleConfig`` class:
+
+```python
+@dataclass
+class TestSegmModuleConfig:
+    segm_test_data_dir: str = "test_data/sem_segm"
+    test_module_name: str = "segmentation"
+    test_module_path: str = "segmentation.py"
+    input_img: str = os.path.join(segm_test_data_dir, "2007_000033.jpg")
+    model: str = ""
+
+    frame_height: str = str(TestSegmConfig.frame_size)
+    frame_width: str = str(TestSegmConfig.frame_size)
+    scale: float = 1.0
+    mean: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0])
+    std: List[float] = field(default_factory=list)
+    crop: bool = False
+    rgb: bool = True
+    classes: str = os.path.join(segm_test_data_dir, "pascal-classes.txt")
+```
+
+The default image preprocessing options are defined in ``default_preprocess_config.py``:
+
+```python
+tf_segm_input_blob = {
+    "scale": str(1 / 127.5),
+    "mean": ["127.5", "127.5", "127.5"],
+    "std": [],
+    "crop": "False",
+    "rgb": "True"
+}
+```
+
+The basis of the model testing is ``samples/dnn/segmentation.py``. ``segmentation.py`` can be executed autonomously with the converted model provided in ``--input`` and the parameters for ``cv2.dnn.blobFromImage`` populated.
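+
+For orientation, the defaults above translate roughly into the following preprocessing call (an illustrative sketch only; the exact wiring lives in the sample scripts, and the file name here is just the test image used earlier):
+
+```python
+import cv2
+import numpy as np
+
+img = cv2.imread("test_data/sem_segm/2007_000033.jpg").astype(np.float32)
+
+blob = cv2.dnn.blobFromImage(
+    image=img,
+    scalefactor=1 / 127.5,       # "scale"
+    size=(500, 500),             # frame_width x frame_height
+    mean=(127.5, 127.5, 127.5),  # "mean"
+    swapRB=True,                 # "rgb"
+    crop=False                   # "crop"
+)
+# "std" is empty for the TF pipeline, so no extra per-channel division is applied
+```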
+
+To reproduce the OpenCV steps described in "Model Conversion Pipeline" from scratch with ``dnn_model_runner``, execute the line below:
+
+```
+python -m dnn_model_runner.dnn_conversion.tf.segmentation.py_to_py_segm --test True --default_img_preprocess True --evaluate False
+```
diff --git a/doc/tutorials/dnn/dnn_text_spotting/detect_test1.jpg b/doc/tutorials/dnn/dnn_text_spotting/detect_test1.jpg
new file mode 100644
index 0000000000..b154dfc4ec
Binary files /dev/null and b/doc/tutorials/dnn/dnn_text_spotting/detect_test1.jpg differ
diff --git a/doc/tutorials/dnn/dnn_text_spotting/detect_test2.jpg b/doc/tutorials/dnn/dnn_text_spotting/detect_test2.jpg
new file mode 100644
index 0000000000..a46dcc03a1
Binary files /dev/null and b/doc/tutorials/dnn/dnn_text_spotting/detect_test2.jpg differ
diff --git a/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown b/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
new file mode 100644
index 0000000000..5f28b6ce7a
--- /dev/null
+++ b/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
@@ -0,0 +1,324 @@
+# High Level API: TextDetectionModel and TextRecognitionModel {#tutorial_dnn_text_spotting}
+
+@tableofcontents
+
+@prev_tutorial{tutorial_dnn_OCR}
+@next_tutorial{pytorch_cls_tutorial_dnn_conversion}
+
+| | |
+| -: | :- |
+| Original author | Wenqing Zhang |
+| Compatibility | OpenCV >= 4.5 |
+
+## Introduction
+In this tutorial, we will introduce the APIs for TextRecognitionModel and TextDetectionModel in detail.
+
+---
+#### TextRecognitionModel:
+
+In the current version, @ref cv::dnn::TextRecognitionModel only supports CNN+RNN+CTC based algorithms,
+and the greedy decoding method for CTC is provided.
+For more information, please refer to the [original paper](https://arxiv.org/abs/1507.05717).
+
+Before recognition, you should `setVocabulary` and `setDecodeType`.
+- "CTC-greedy", the output of the text recognition model should be a probability matrix.
+  The shape should be `(T, B, Dim)`, where
+  - `T` is the sequence length
+  - `B` is the batch size (only `B=1` is supported in inference)
+  - and `Dim` is the length of the vocabulary +1 ('Blank' of CTC is at index 0 of Dim).
+
+@ref cv::dnn::TextRecognitionModel::recognize() is the main function for text recognition.
+- The input image should be a cropped text image or an image with `roiRects`
+- Other decoding methods may be supported in the future
+
+---
+
+#### TextDetectionModel:
+
+@ref cv::dnn::TextDetectionModel API provides these methods for text detection:
+- cv::dnn::TextDetectionModel::detect() returns the results in std::vector<std::vector<Point>> (4-point quadrangles)
+- cv::dnn::TextDetectionModel::detectTextRectangles() returns the results in std::vector<cv::RotatedRect> (RBOX-like)
+
+In the current version, @ref cv::dnn::TextDetectionModel supports these algorithms:
+- use @ref cv::dnn::TextDetectionModel_DB with "DB" models
+- and use @ref cv::dnn::TextDetectionModel_EAST with "EAST" models
+
+The pretrained models provided below are variants of DB (w/o deformable convolution),
+and their performance is reported in Table 1 of the [paper](https://arxiv.org/abs/1911.08947).
+For more information, please refer to the [official code](https://github.com/MhLiao/DB).
+
+---
+
+You can train your own model with more data, and convert it into ONNX format.
+We encourage you to add new algorithms to these APIs.
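+
+If you train your own recognizer, the export step can look like the sketch below. The module here is a hypothetical stand-in; only the 100x32 input size and the ``(T, B, Dim)`` output layout described above are taken from this tutorial, everything else is an assumption:
+
+```python
+import torch
+import torch.nn as nn
+
+# hypothetical stand-in for a trained CRNN-style recognizer;
+# replace it with your own model before exporting
+class TinyRecognizer(nn.Module):
+    def __init__(self, dim=37):  # 36 characters + 1 CTC 'Blank'
+        super().__init__()
+        self.conv = nn.Conv2d(1, dim, kernel_size=(32, 4), stride=(1, 4))
+
+    def forward(self, x):                        # x: (B, 1, 32, 100)
+        feat = self.conv(x)                      # (B, dim, 1, T)
+        return feat.squeeze(2).permute(2, 0, 1)  # (T, B, dim)
+
+model = TinyRecognizer().eval()
+
+# recognition models in this tutorial consume 100x32 text-line crops
+dummy_input = torch.randn(1, 1, 32, 100)
+
+torch.onnx.export(
+    model,
+    dummy_input,
+    "my_recognizer.onnx",
+    input_names=["input"],
+    output_names=["output"],  # probability matrix of shape (T, B, Dim)
+    opset_version=11
+)
+```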
+
+
+## Pretrained Models
+
+#### TextRecognitionModel:
+
+```
+crnn.onnx:
+url: https://drive.google.com/uc?export=dowload&id=1ooaLR-rkTl8jdpGy1DoQs0-X0lQsB6Fj
+sha: 270d92c9ccb670ada2459a25977e8deeaf8380d3
+alphabet_36.txt: https://drive.google.com/uc?export=dowload&id=1oPOYx5rQRp8L6XQciUwmwhMCfX0KyO4b
+parameter setting: -rgb=0;
+description: The classification number of this model is 36 (0~9 + a~z).
+             The training dataset is MJSynth.
+
+crnn_cs.onnx:
+url: https://drive.google.com/uc?export=dowload&id=12diBsVJrS9ZEl6BNUiRp9s0xPALBS7kt
+sha: a641e9c57a5147546f7a2dbea4fd322b47197cd5
+alphabet_94.txt: https://drive.google.com/uc?export=dowload&id=1oKXxXKusquimp7XY1mFvj9nwLzldVgBR
+parameter setting: -rgb=1;
+description: The classification number of this model is 94 (0~9 + a~z + A~Z + punctuations).
+             The training datasets are MJsynth and SynthText.
+
+crnn_cs_CN.onnx:
+url: https://drive.google.com/uc?export=dowload&id=1is4eYEUKH7HR7Gl37Sw4WPXx6Ir8oQEG
+sha: 3940942b85761c7f240494cf662dcbf05dc00d14
+alphabet_3944.txt: https://drive.google.com/uc?export=dowload&id=18IZUUdNzJ44heWTndDO6NNfIpJMmN-ul
+parameter setting: -rgb=1;
+description: The classification number of this model is 3944 (0~9 + a~z + A~Z + Chinese characters + special characters).
+             The training dataset is ReCTS (https://rrc.cvc.uab.es/?ch=12).
+```
+
+More models can be found [here](https://drive.google.com/drive/folders/1cTbQ3nuZG-EKWak6emD_s8_hHXWz7lAr?usp=sharing),
+which are taken from [clovaai](https://github.com/clovaai/deep-text-recognition-benchmark).
+You can train more models with [CRNN](https://github.com/meijieru/crnn.pytorch), and convert them with `torch.onnx.export`.
+
+#### TextDetectionModel:
+
+```
+- DB_IC15_resnet50.onnx:
+url: https://drive.google.com/uc?export=dowload&id=17_ABp79PlFt9yPCxSaarVc_DKTmrSGGf
+sha: bef233c28947ef6ec8c663d20a2b326302421fa3
+recommended parameter setting: -inputHeight=736, -inputWidth=1280;
+description: This model is trained on ICDAR2015, so it can only detect English text instances.
+
+- DB_IC15_resnet18.onnx:
+url: https://drive.google.com/uc?export=dowload&id=1sZszH3pEt8hliyBlTmB-iulxHP1dCQWV
+sha: 19543ce09b2efd35f49705c235cc46d0e22df30b
+recommended parameter setting: -inputHeight=736, -inputWidth=1280;
+description: This model is trained on ICDAR2015, so it can only detect English text instances.
+
+- DB_TD500_resnet50.onnx:
+url: https://drive.google.com/uc?export=dowload&id=19YWhArrNccaoSza0CfkXlA8im4-lAGsR
+sha: 1b4dd21a6baa5e3523156776970895bd3db6960a
+recommended parameter setting: -inputHeight=736, -inputWidth=736;
+description: This model is trained on MSRA-TD500, so it can detect both English and Chinese text instances.
+
+- DB_TD500_resnet18.onnx:
+url: https://drive.google.com/uc?export=dowload&id=1vY_KsDZZZb_svd5RT6pjyI8BS1nPbBSX
+sha: 8a3700bdc13e00336a815fc7afff5dcc1ce08546
+recommended parameter setting: -inputHeight=736, -inputWidth=736;
+description: This model is trained on MSRA-TD500, so it can detect both English and Chinese text instances.
+```
+
+We will release more models of DB [here](https://drive.google.com/drive/folders/1qzNCHfUJOS0NEUOIKn69eCtxdlNPpWbq?usp=sharing) in the future.
+
+```
+- EAST:
+Download link: https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1
+This model is based on https://github.com/argman/EAST
+```
+
+## Images for Testing
+
+```
+Text Recognition:
+url: https://drive.google.com/uc?export=dowload&id=1nMcEy68zDNpIlqAn6xCk_kYcUTIeSOtN
+sha: 89205612ce8dd2251effa16609342b69bff67ca3
+
+Text Detection:
+url: https://drive.google.com/uc?export=dowload&id=149tAhIcvfCYeyufRoZ9tmc2mZDKE_XrF
+sha: ced3c03fb7f8d9608169a913acf7e7b93e07109b
+```
+
+## Example for Text Recognition
+
+Step1. Loading images and models with a vocabulary
+
+```cpp
+    // Load a cropped text line image
+    // you can find cropped images for testing in "Images for Testing"
+    int rgb = IMREAD_COLOR; // This should be changed according to the model input requirement.
+    Mat image = imread("path/to/text_rec_test.png", rgb);
+
+    // Load model weights
+    TextRecognitionModel model("path/to/crnn_cs.onnx");
+
+    // The decoding method
+    // more methods will be supported in the future
+    model.setDecodeType("CTC-greedy");
+
+    // Load vocabulary
+    // vocabulary should be changed according to the text recognition model
+    std::ifstream vocFile;
+    vocFile.open("path/to/alphabet_94.txt");
+    CV_Assert(vocFile.is_open());
+    String vocLine;
+    std::vector<String> vocabulary;
+    while (std::getline(vocFile, vocLine)) {
+        vocabulary.push_back(vocLine);
+    }
+    model.setVocabulary(vocabulary);
+```
+
+Step2. Setting Parameters
+
+```cpp
+    // Normalization parameters
+    double scale = 1.0 / 127.5;
+    Scalar mean = Scalar(127.5, 127.5, 127.5);
+
+    // The input shape
+    Size inputSize = Size(100, 32);
+
+    model.setInputParams(scale, inputSize, mean);
+```
+Step3. Inference
+```cpp
+    std::string recognitionResult = model.recognize(image);
+    std::cout << "'" << recognitionResult << "'" << std::endl;
+```
+
+Input image:
+
+![Picture example](text_rec_test.png)
+
+Output:
+```
+'welcome'
+```
+
+
+## Example for Text Detection
+
+Step1. Loading images and models
+```cpp
+    // Load an image
+    // you can find some images for testing in "Images for Testing"
+    Mat frame = imread("/path/to/text_det_test.png");
+```
+
+Step2.a Setting Parameters (DB)
+```cpp
+    // Load model weights
+    TextDetectionModel_DB model("/path/to/DB_TD500_resnet50.onnx");
+
+    // Post-processing parameters
+    float binThresh = 0.3;
+    float polyThresh = 0.5;
+    uint maxCandidates = 200;
+    double unclipRatio = 2.0;
+    model.setBinaryThreshold(binThresh)
+         .setPolygonThreshold(polyThresh)
+         .setMaxCandidates(maxCandidates)
+         .setUnclipRatio(unclipRatio)
+    ;
+
+    // Normalization parameters
+    double scale = 1.0 / 255.0;
+    Scalar mean = Scalar(122.67891434, 116.66876762, 104.00698793);
+
+    // The input shape
+    Size inputSize = Size(736, 736);
+
+    model.setInputParams(scale, inputSize, mean);
+```
+
+Step2.b Setting Parameters (EAST)
+```cpp
+    TextDetectionModel_EAST model("EAST.pb");
+
+    float confThreshold = 0.5;
+    float nmsThreshold = 0.4;
+    model.setConfidenceThreshold(confThreshold)
+         .setNMSThreshold(nmsThreshold)
+    ;
+
+    double detScale = 1.0;
+    Size detInputSize = Size(320, 320);
+    Scalar detMean = Scalar(123.68, 116.78, 103.94);
+    bool swapRB = true;
+    model.setInputParams(detScale, detInputSize, detMean, swapRB);
+```
+
+
+Step3. Inference
+```cpp
+    std::vector<std::vector<Point>> detResults;
+    model.detect(frame, detResults);
+
+    // Visualization
+    polylines(frame, detResults, true, Scalar(0, 255, 0), 2);
+    imshow("Text Detection", frame);
+    waitKey();
+```
+
+Output:
+
+![Picture example](text_det_test_results.jpg)
+
+## Example for Text Spotting
+
+After following the steps above, it is easy to get the detection results of an input image.
+Then, you can apply a transformation and crop the text images for recognition.
+For more information, please refer to the **Detailed Sample**.
+```cpp
+    // Transform and Crop
+    Mat cropped;
+    fourPointsTransform(recInput, vertices, cropped);
+
+    String recResult = recognizer.recognize(cropped);
+```
+
+Output Examples:
+
+![Picture example](detect_test1.jpg)
+
+![Picture example](detect_test2.jpg)
+
+## Source Code
+The [source code](https://github.com/opencv/opencv/blob/master/modules/dnn/src/model.cpp)
+of these APIs can be found in the DNN module.
+
+## Detailed Sample
+For more information, please refer to:
+- [samples/dnn/scene_text_recognition.cpp](https://github.com/opencv/opencv/blob/master/samples/dnn/scene_text_recognition.cpp)
+- [samples/dnn/scene_text_detection.cpp](https://github.com/opencv/opencv/blob/master/samples/dnn/scene_text_detection.cpp)
+- [samples/dnn/text_detection.cpp](https://github.com/opencv/opencv/blob/master/samples/dnn/text_detection.cpp)
+- [samples/dnn/scene_text_spotting.cpp](https://github.com/opencv/opencv/blob/master/samples/dnn/scene_text_spotting.cpp)
+
+#### Test with an image
+Examples:
+```bash
+example_dnn_scene_text_recognition -mp=path/to/crnn_cs.onnx -i=path/to/an/image -rgb=1 -vp=/path/to/alphabet_94.txt
+example_dnn_scene_text_detection -mp=path/to/DB_TD500_resnet50.onnx -i=path/to/an/image -ih=736 -iw=736
+example_dnn_scene_text_spotting -dmp=path/to/DB_IC15_resnet50.onnx -rmp=path/to/crnn_cs.onnx -i=path/to/an/image -iw=1280 -ih=736 -rgb=1 -vp=/path/to/alphabet_94.txt
+example_dnn_text_detection -dmp=path/to/EAST.pb -rmp=path/to/crnn_cs.onnx -i=path/to/an/image -rgb=1 -vp=path/to/alphabet_94.txt
+```
+
+#### Test on public datasets
+Text Recognition:
+
+The download link for the testing images can be found in **Images for Testing**.
+
+Examples:
+```bash
+example_dnn_scene_text_recognition -mp=path/to/crnn.onnx -e=true -edp=path/to/evaluation_data_rec -vp=/path/to/alphabet_36.txt -rgb=0
+example_dnn_scene_text_recognition -mp=path/to/crnn_cs.onnx -e=true -edp=path/to/evaluation_data_rec -vp=/path/to/alphabet_94.txt -rgb=1
+```
+
+Text Detection:
+
+The download links for the testing images can be found in **Images for Testing**.
+
+Examples:
+```bash
+example_dnn_scene_text_detection -mp=path/to/DB_TD500_resnet50.onnx -e=true -edp=path/to/evaluation_data_det/TD500 -ih=736 -iw=736
+example_dnn_scene_text_detection -mp=path/to/DB_IC15_resnet50.onnx -e=true -edp=path/to/evaluation_data_det/IC15 -ih=736 -iw=1280
+```
diff --git a/doc/tutorials/dnn/dnn_text_spotting/text_det_test_results.jpg b/doc/tutorials/dnn/dnn_text_spotting/text_det_test_results.jpg
new file mode 100644
index 0000000000..173840f729
Binary files /dev/null and b/doc/tutorials/dnn/dnn_text_spotting/text_det_test_results.jpg differ
diff --git a/doc/tutorials/dnn/dnn_text_spotting/text_rec_test.png b/doc/tutorials/dnn/dnn_text_spotting/text_rec_test.png
new file mode 100644
index 0000000000..c3226376e4
Binary files /dev/null and b/doc/tutorials/dnn/dnn_text_spotting/text_rec_test.png differ
diff --git a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
index 
1552d4e654..0973396db5 100644 --- a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown +++ b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown @@ -1,9 +1,16 @@ YOLO DNNs {#tutorial_dnn_yolo} =============================== +@tableofcontents + @prev_tutorial{tutorial_dnn_android} @next_tutorial{tutorial_dnn_javascript} +| | | +| -: | :- | +| Original author | Alessandro de Oliveira Faria | +| Compatibility | OpenCV >= 3.3.1 | + Introduction ------------ diff --git a/doc/tutorials/dnn/table_of_content_dnn.markdown b/doc/tutorials/dnn/table_of_content_dnn.markdown index 0a66d04ee4..0d5e43ee11 100644 --- a/doc/tutorials/dnn/table_of_content_dnn.markdown +++ b/doc/tutorials/dnn/table_of_content_dnn.markdown @@ -2,81 +2,23 @@ Deep Neural Networks (dnn module) {#tutorial_table_of_content_dnn} ===================================== - @subpage tutorial_dnn_googlenet - - *Languages:* C++ - - *Compatibility:* \> OpenCV 3.3 - - *Author:* Vitaliy Lyudvichenko - - In this tutorial you will learn how to use opencv_dnn module for image classification by using GoogLeNet trained network from Caffe model zoo. - - @subpage tutorial_dnn_halide - - *Languages:* Halide - - *Compatibility:* \> OpenCV 3.3 - - *Author:* Dmitry Kurtaev - - This tutorial guidelines how to run your models in OpenCV deep learning module using Halide language backend. - - @subpage tutorial_dnn_halide_scheduling - - *Languages:* Halide - - *Compatibility:* \> OpenCV 3.3 - - *Author:* Dmitry Kurtaev - - In this tutorial we describe the ways to schedule your networks using Halide backend in OpenCV deep learning module. - - @subpage tutorial_dnn_android - - *Languages:* Java - - *Compatibility:* \> OpenCV 3.3 - - *Author:* Dmitry Kurtaev - - This tutorial will show you how to run deep learning model using OpenCV on Android device. - - @subpage tutorial_dnn_yolo - - *Languages:* C++, Python - - *Compatibility:* \> OpenCV 3.3.1 - - *Author:* Alessandro de Oliveira Faria - - In this tutorial you will learn how to use opencv_dnn module using yolo_object_detection with device capture, video file or image. - - @subpage tutorial_dnn_javascript - - *Languages:* JavaScript - - *Compatibility:* \> OpenCV 3.3.1 - - *Author:* Dmitry Kurtaev - - In this tutorial we'll run deep learning models in browser using OpenCV.js. - - @subpage tutorial_dnn_custom_layers - - *Languages:* C++, Python - - *Compatibility:* \> OpenCV 3.4.1 - - *Author:* Dmitry Kurtaev - - How to define custom layers to import networks. - - @subpage tutorial_dnn_OCR +- @subpage tutorial_dnn_text_spotting - *Languages:* C++ +#### PyTorch models with OpenCV +In this section you will find the guides, which describe how to run classification, segmentation and detection PyTorch DNN models with OpenCV. +- @subpage pytorch_cls_tutorial_dnn_conversion +- @subpage pytorch_cls_c_tutorial_dnn_conversion +- @subpage pytorch_segm_tutorial_dnn_conversion - *Compatibility:* \> OpenCV 4.3 - - *Author:* Zihao Mu - - In this tutorial you will learn how to use opencv_dnn module using custom OCR models. \ No newline at end of file +#### TensorFlow models with OpenCV +In this section you will find the guides, which describe how to run classification, segmentation and detection TensorFlow DNN models with OpenCV. 
+- @subpage tf_cls_tutorial_dnn_conversion +- @subpage tf_det_tutorial_dnn_conversion +- @subpage tf_segm_tutorial_dnn_conversion diff --git a/doc/tutorials/features2d/akaze_matching/akaze_matching.markdown b/doc/tutorials/features2d/akaze_matching/akaze_matching.markdown index 0635dfc4f6..9cb920ff1b 100644 --- a/doc/tutorials/features2d/akaze_matching/akaze_matching.markdown +++ b/doc/tutorials/features2d/akaze_matching/akaze_matching.markdown @@ -1,9 +1,16 @@ AKAZE local features matching {#tutorial_akaze_matching} ============================= +@tableofcontents + @prev_tutorial{tutorial_detection_of_planar_objects} @next_tutorial{tutorial_akaze_tracking} +| | | +| -: | :- | +| Original author | Fedor Morozov | +| Compatibility | OpenCV >= 3.0 | + Introduction ------------ diff --git a/doc/tutorials/features2d/akaze_tracking/akaze_tracking.markdown b/doc/tutorials/features2d/akaze_tracking/akaze_tracking.markdown index 58071ffd42..dd23957d5d 100644 --- a/doc/tutorials/features2d/akaze_tracking/akaze_tracking.markdown +++ b/doc/tutorials/features2d/akaze_tracking/akaze_tracking.markdown @@ -1,9 +1,16 @@ AKAZE and ORB planar tracking {#tutorial_akaze_tracking} ============================= +@tableofcontents + @prev_tutorial{tutorial_akaze_matching} @next_tutorial{tutorial_homography} +| | | +| -: | :- | +| Original author | Fedor Morozov | +| Compatibility | OpenCV >= 3.0 | + Introduction ------------ diff --git a/doc/tutorials/features2d/detection_of_planar_objects/detection_of_planar_objects.markdown b/doc/tutorials/features2d/detection_of_planar_objects/detection_of_planar_objects.markdown index 9febdb7acd..2d19e3387e 100644 --- a/doc/tutorials/features2d/detection_of_planar_objects/detection_of_planar_objects.markdown +++ b/doc/tutorials/features2d/detection_of_planar_objects/detection_of_planar_objects.markdown @@ -1,9 +1,15 @@ Detection of planar objects {#tutorial_detection_of_planar_objects} =========================== +@tableofcontents + @prev_tutorial{tutorial_feature_homography} @next_tutorial{tutorial_akaze_matching} +| | | +| -: | :- | +| Original author | Victor Eruhimov | +| Compatibility | OpenCV >= 3.0 | The goal of this tutorial is to learn how to use *features2d* and *calib3d* modules for detecting known planar objects in scenes. 
diff --git a/doc/tutorials/features2d/feature_description/feature_description.markdown b/doc/tutorials/features2d/feature_description/feature_description.markdown index 70a30096f5..1b77b6a624 100644 --- a/doc/tutorials/features2d/feature_description/feature_description.markdown +++ b/doc/tutorials/features2d/feature_description/feature_description.markdown @@ -1,9 +1,16 @@ Feature Description {#tutorial_feature_description} =================== +@tableofcontents + @prev_tutorial{tutorial_feature_detection} @next_tutorial{tutorial_feature_flann_matcher} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/features2d/feature_detection/feature_detection.markdown b/doc/tutorials/features2d/feature_detection/feature_detection.markdown index a22ef90520..0d52877318 100644 --- a/doc/tutorials/features2d/feature_detection/feature_detection.markdown +++ b/doc/tutorials/features2d/feature_detection/feature_detection.markdown @@ -1,9 +1,16 @@ Feature Detection {#tutorial_feature_detection} ================= +@tableofcontents + @prev_tutorial{tutorial_corner_subpixels} @next_tutorial{tutorial_feature_description} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.markdown b/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.markdown index 2e5f12c922..1416604aa3 100644 --- a/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.markdown +++ b/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.markdown @@ -1,9 +1,16 @@ Feature Matching with FLANN {#tutorial_feature_flann_matcher} =========================== +@tableofcontents + @prev_tutorial{tutorial_feature_description} @next_tutorial{tutorial_feature_homography} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/features2d/feature_homography/feature_homography.markdown b/doc/tutorials/features2d/feature_homography/feature_homography.markdown index b2d23435eb..4b75c7f485 100644 --- a/doc/tutorials/features2d/feature_homography/feature_homography.markdown +++ b/doc/tutorials/features2d/feature_homography/feature_homography.markdown @@ -1,9 +1,16 @@ Features2D + Homography to find a known object {#tutorial_feature_homography} ============================================== +@tableofcontents + @prev_tutorial{tutorial_feature_flann_matcher} @next_tutorial{tutorial_detection_of_planar_objects} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/features2d/homography/homography.markdown b/doc/tutorials/features2d/homography/homography.markdown index 960511dd3d..10690ddf8d 100644 --- a/doc/tutorials/features2d/homography/homography.markdown +++ b/doc/tutorials/features2d/homography/homography.markdown @@ -1,8 +1,14 @@ Basic concepts of the homography explained with code {#tutorial_homography} ==================================================== +@tableofcontents + @prev_tutorial{tutorial_akaze_tracking} +| | | +| -: | :- | +| Compatibility | OpenCV >= 3.0 | + @tableofcontents Introduction {#tutorial_homography_Introduction} diff --git a/doc/tutorials/features2d/images/AKAZE_Match_Tutorial_Cover.png b/doc/tutorials/features2d/images/AKAZE_Match_Tutorial_Cover.png deleted file mode 100644 index fdf2007ba2..0000000000 Binary files 
a/doc/tutorials/features2d/images/AKAZE_Match_Tutorial_Cover.png and /dev/null differ diff --git a/doc/tutorials/features2d/images/AKAZE_Tracking_Tutorial_Cover.png b/doc/tutorials/features2d/images/AKAZE_Tracking_Tutorial_Cover.png deleted file mode 100644 index bb3272c96b..0000000000 Binary files a/doc/tutorials/features2d/images/AKAZE_Tracking_Tutorial_Cover.png and /dev/null differ diff --git a/doc/tutorials/features2d/images/Feature_Description_Tutorial_Cover.jpg b/doc/tutorials/features2d/images/Feature_Description_Tutorial_Cover.jpg deleted file mode 100644 index 975caa62ef..0000000000 Binary files a/doc/tutorials/features2d/images/Feature_Description_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/features2d/images/Feature_Detection_Tutorial_Cover.jpg b/doc/tutorials/features2d/images/Feature_Detection_Tutorial_Cover.jpg deleted file mode 100644 index cca9a2b438..0000000000 Binary files a/doc/tutorials/features2d/images/Feature_Detection_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/features2d/images/Feature_Flann_Matcher_Tutorial_Cover.jpg b/doc/tutorials/features2d/images/Feature_Flann_Matcher_Tutorial_Cover.jpg deleted file mode 100644 index e3f66fa0d1..0000000000 Binary files a/doc/tutorials/features2d/images/Feature_Flann_Matcher_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/features2d/images/Feature_Homography_Tutorial_Cover.jpg b/doc/tutorials/features2d/images/Feature_Homography_Tutorial_Cover.jpg deleted file mode 100644 index d509cd9eb7..0000000000 Binary files a/doc/tutorials/features2d/images/Feature_Homography_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/features2d/images/detection_of_planar_objects.png b/doc/tutorials/features2d/images/detection_of_planar_objects.png deleted file mode 100644 index 92de70cfdc..0000000000 Binary files a/doc/tutorials/features2d/images/detection_of_planar_objects.png and /dev/null differ diff --git a/doc/tutorials/features2d/images/trackingmotion/Corner_Subpixeles_Cover.jpg b/doc/tutorials/features2d/images/trackingmotion/Corner_Subpixeles_Cover.jpg deleted file mode 100644 index 61ec8d1d89..0000000000 Binary files a/doc/tutorials/features2d/images/trackingmotion/Corner_Subpixeles_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/features2d/images/trackingmotion/Generic_Corner_Detector_Cover.jpg b/doc/tutorials/features2d/images/trackingmotion/Generic_Corner_Detector_Cover.jpg deleted file mode 100644 index 89fc7bef7c..0000000000 Binary files a/doc/tutorials/features2d/images/trackingmotion/Generic_Corner_Detector_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/features2d/images/trackingmotion/Harris_Detector_Cover.jpg b/doc/tutorials/features2d/images/trackingmotion/Harris_Detector_Cover.jpg deleted file mode 100644 index bc4d816e32..0000000000 Binary files a/doc/tutorials/features2d/images/trackingmotion/Harris_Detector_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/features2d/images/trackingmotion/Shi_Tomasi_Detector_Cover.jpg b/doc/tutorials/features2d/images/trackingmotion/Shi_Tomasi_Detector_Cover.jpg deleted file mode 100644 index e0ee608d9c..0000000000 Binary files a/doc/tutorials/features2d/images/trackingmotion/Shi_Tomasi_Detector_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/features2d/table_of_content_features2d.markdown b/doc/tutorials/features2d/table_of_content_features2d.markdown index f42e2a571c..29c99018fc 100644 --- a/doc/tutorials/features2d/table_of_content_features2d.markdown +++ 
b/doc/tutorials/features2d/table_of_content_features2d.markdown @@ -1,128 +1,15 @@ 2D Features framework (feature2d module) {#tutorial_table_of_content_features2d} ========================================= -Learn about how to use the feature points detectors, descriptors and matching framework found inside -OpenCV. - - @subpage tutorial_harris_detector - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Why is it a good idea to track corners? We learn how to use the Harris method to detect - corners. - - @subpage tutorial_good_features_to_track - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we use an improved method to detect corners more accurately. - - @subpage tutorial_generic_corner_detector - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Here you will learn how to use OpenCV functions to make your personalized corner detector! - - *Languages:* C++, Java, Python - - @subpage tutorial_corner_subpixels - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Is pixel resolution enough? Here we learn a simple method to improve our corner location accuracy. - - @subpage tutorial_feature_detection - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - In this tutorial, you will use *features2d* to detect interest points. - - @subpage tutorial_feature_description - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - In this tutorial, you will use *features2d* to calculate feature vectors. - - @subpage tutorial_feature_flann_matcher - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - In this tutorial, you will use the FLANN library to make a fast matching. - - @subpage tutorial_feature_homography - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - In this tutorial, you will use *features2d* and *calib3d* to detect an object in a scene. - - @subpage tutorial_detection_of_planar_objects - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Victor Eruhimov - - You will use *features2d* and *calib3d* modules for detecting known planar objects in - scenes. - - @subpage tutorial_akaze_matching - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 3.0 - - *Author:* Fedor Morozov - - Using *AKAZE* local features to find correspondence between two images. - - @subpage tutorial_akaze_tracking - - *Languages:* C++ - - *Compatibility:* \> OpenCV 3.0 - - *Author:* Fedor Morozov - - Using *AKAZE* and *ORB* for planar object tracking. - - @subpage tutorial_homography - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 3.0 - - This tutorial will explain the basic concepts of the homography with some - demonstration codes. 
diff --git a/doc/tutorials/features2d/trackingmotion/corner_subpixels/corner_subpixels.markdown b/doc/tutorials/features2d/trackingmotion/corner_subpixels/corner_subpixels.markdown index a9316b732d..e43fc3b344 100644 --- a/doc/tutorials/features2d/trackingmotion/corner_subpixels/corner_subpixels.markdown +++ b/doc/tutorials/features2d/trackingmotion/corner_subpixels/corner_subpixels.markdown @@ -1,9 +1,16 @@ Detecting corners location in subpixels {#tutorial_corner_subpixels} ======================================= +@tableofcontents + @prev_tutorial{tutorial_generic_corner_detector} @next_tutorial{tutorial_feature_detection} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/features2d/trackingmotion/generic_corner_detector/generic_corner_detector.markdown b/doc/tutorials/features2d/trackingmotion/generic_corner_detector/generic_corner_detector.markdown index 6082b9b91d..def95717b4 100644 --- a/doc/tutorials/features2d/trackingmotion/generic_corner_detector/generic_corner_detector.markdown +++ b/doc/tutorials/features2d/trackingmotion/generic_corner_detector/generic_corner_detector.markdown @@ -1,9 +1,15 @@ Creating your own corner detector {#tutorial_generic_corner_detector} ================================= +@tableofcontents + @prev_tutorial{tutorial_good_features_to_track} @next_tutorial{tutorial_corner_subpixels} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | Goal ---- diff --git a/doc/tutorials/features2d/trackingmotion/good_features_to_track/good_features_to_track.markdown b/doc/tutorials/features2d/trackingmotion/good_features_to_track/good_features_to_track.markdown index 7e8cf1157e..19023c24b5 100644 --- a/doc/tutorials/features2d/trackingmotion/good_features_to_track/good_features_to_track.markdown +++ b/doc/tutorials/features2d/trackingmotion/good_features_to_track/good_features_to_track.markdown @@ -1,9 +1,16 @@ Shi-Tomasi corner detector {#tutorial_good_features_to_track} ========================== +@tableofcontents + @prev_tutorial{tutorial_harris_detector} @next_tutorial{tutorial_generic_corner_detector} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/features2d/trackingmotion/harris_detector/harris_detector.markdown b/doc/tutorials/features2d/trackingmotion/harris_detector/harris_detector.markdown index f0b32683ce..1ed9cd51ac 100644 --- a/doc/tutorials/features2d/trackingmotion/harris_detector/harris_detector.markdown +++ b/doc/tutorials/features2d/trackingmotion/harris_detector/harris_detector.markdown @@ -1,8 +1,15 @@ Harris corner detector {#tutorial_harris_detector} ====================== +@tableofcontents + @next_tutorial{tutorial_good_features_to_track} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/gapi/anisotropic_segmentation/porting_anisotropic_segmentation.markdown b/doc/tutorials/gapi/anisotropic_segmentation/porting_anisotropic_segmentation.markdown index 2912c6fba5..fa25c2b0b3 100644 --- a/doc/tutorials/gapi/anisotropic_segmentation/porting_anisotropic_segmentation.markdown +++ b/doc/tutorials/gapi/anisotropic_segmentation/porting_anisotropic_segmentation.markdown @@ -1,5 +1,8 @@ # Porting anisotropic image segmentation on G-API {#tutorial_gapi_anisotropic_segmentation} +@prev_tutorial{tutorial_gapi_interactive_face_detection} +@next_tutorial{tutorial_gapi_face_beautification} + [TOC] 
# Introduction {#gapi_anisotropic_intro} diff --git a/doc/tutorials/gapi/face_beautification/face_beautification.markdown b/doc/tutorials/gapi/face_beautification/face_beautification.markdown index 9e56db0a54..1ceb416c99 100644 --- a/doc/tutorials/gapi/face_beautification/face_beautification.markdown +++ b/doc/tutorials/gapi/face_beautification/face_beautification.markdown @@ -1,5 +1,7 @@ # Implementing a face beautification algorithm with G-API {#tutorial_gapi_face_beautification} +@prev_tutorial{tutorial_gapi_anisotropic_segmentation} + [TOC] # Introduction {#gapi_fb_intro} diff --git a/doc/tutorials/gapi/interactive_face_detection/interactive_face_detection.markdown b/doc/tutorials/gapi/interactive_face_detection/interactive_face_detection.markdown index e5ca466da7..6f8b03bb61 100644 --- a/doc/tutorials/gapi/interactive_face_detection/interactive_face_detection.markdown +++ b/doc/tutorials/gapi/interactive_face_detection/interactive_face_detection.markdown @@ -1,5 +1,7 @@ # Face analytics pipeline with G-API {#tutorial_gapi_interactive_face_detection} +@next_tutorial{tutorial_gapi_anisotropic_segmentation} + [TOC] # Overview {#gapi_ifd_intro} diff --git a/doc/tutorials/gpu/gpu-basics-similarity/gpu_basics_similarity.markdown b/doc/tutorials/gpu/gpu-basics-similarity/gpu_basics_similarity.markdown index 74840efbc2..60e136fc6d 100644 --- a/doc/tutorials/gpu/gpu-basics-similarity/gpu_basics_similarity.markdown +++ b/doc/tutorials/gpu/gpu-basics-similarity/gpu_basics_similarity.markdown @@ -1,6 +1,9 @@ @cond CUDA_MODULES Similarity check (PNSR and SSIM) on the GPU {#tutorial_gpu_basics_similarity} =========================================== + +@tableofcontents + @todo update this tutorial @next_tutorial{tutorial_gpu_thrust_interop} diff --git a/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown b/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown index c930f883a3..b5d79ab0bc 100644 --- a/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown +++ b/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown @@ -2,6 +2,8 @@ Using a cv::cuda::GpuMat with thrust {#tutorial_gpu_thrust_interop} =========================================== +@tableofcontents + @prev_tutorial{tutorial_gpu_basics_similarity} Goal diff --git a/doc/tutorials/highgui/images/Adding_Trackbars_Tutorial_Cover.jpg b/doc/tutorials/highgui/images/Adding_Trackbars_Tutorial_Cover.jpg deleted file mode 100644 index e914cab1d5..0000000000 Binary files a/doc/tutorials/highgui/images/Adding_Trackbars_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/highgui/table_of_content_highgui.markdown b/doc/tutorials/highgui/table_of_content_highgui.markdown deleted file mode 100644 index fb5a343664..0000000000 --- a/doc/tutorials/highgui/table_of_content_highgui.markdown +++ /dev/null @@ -1,14 +0,0 @@ -High Level GUI and Media (highgui module) {#tutorial_table_of_content_highgui} -========================================= - -This section contains tutorials about how to use the built-in graphical user interface of the library. 
- -- @subpage tutorial_trackbar - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - We will learn how to add a Trackbar to our applications diff --git a/doc/tutorials/images/calib3d.jpg b/doc/tutorials/images/calib3d.jpg deleted file mode 100644 index c5226a9a1d..0000000000 Binary files a/doc/tutorials/images/calib3d.jpg and /dev/null differ diff --git a/doc/tutorials/images/core.jpg b/doc/tutorials/images/core.jpg deleted file mode 100644 index 6fe819bd52..0000000000 Binary files a/doc/tutorials/images/core.jpg and /dev/null differ diff --git a/doc/tutorials/images/feature2D.jpg b/doc/tutorials/images/feature2D.jpg deleted file mode 100644 index 6744de0610..0000000000 Binary files a/doc/tutorials/images/feature2D.jpg and /dev/null differ diff --git a/doc/tutorials/images/general.jpg b/doc/tutorials/images/general.jpg deleted file mode 100644 index 95829d9c64..0000000000 Binary files a/doc/tutorials/images/general.jpg and /dev/null differ diff --git a/doc/tutorials/images/gpu.jpg b/doc/tutorials/images/gpu.jpg deleted file mode 100644 index 4cc053895c..0000000000 Binary files a/doc/tutorials/images/gpu.jpg and /dev/null differ diff --git a/doc/tutorials/images/highgui.jpg b/doc/tutorials/images/highgui.jpg deleted file mode 100644 index ada65fcb03..0000000000 Binary files a/doc/tutorials/images/highgui.jpg and /dev/null differ diff --git a/doc/tutorials/images/imgproc.jpg b/doc/tutorials/images/imgproc.jpg deleted file mode 100644 index ad7dafb0b7..0000000000 Binary files a/doc/tutorials/images/imgproc.jpg and /dev/null differ diff --git a/doc/tutorials/images/introduction.jpg b/doc/tutorials/images/introduction.jpg deleted file mode 100644 index 19a9284785..0000000000 Binary files a/doc/tutorials/images/introduction.jpg and /dev/null differ diff --git a/doc/tutorials/images/ml.jpg b/doc/tutorials/images/ml.jpg deleted file mode 100644 index 40acfcfbfd..0000000000 Binary files a/doc/tutorials/images/ml.jpg and /dev/null differ diff --git a/doc/tutorials/images/objdetect.jpg b/doc/tutorials/images/objdetect.jpg deleted file mode 100644 index c811f348f5..0000000000 Binary files a/doc/tutorials/images/objdetect.jpg and /dev/null differ diff --git a/doc/tutorials/images/opencv_ios.png b/doc/tutorials/images/opencv_ios.png deleted file mode 100644 index ce2031d7c0..0000000000 Binary files a/doc/tutorials/images/opencv_ios.png and /dev/null differ diff --git a/doc/tutorials/images/photo.png b/doc/tutorials/images/photo.png deleted file mode 100644 index f701ffacf1..0000000000 Binary files a/doc/tutorials/images/photo.png and /dev/null differ diff --git a/doc/tutorials/images/retina.jpg b/doc/tutorials/images/retina.jpg deleted file mode 100644 index 2d2465070f..0000000000 Binary files a/doc/tutorials/images/retina.jpg and /dev/null differ diff --git a/doc/tutorials/images/video.jpg b/doc/tutorials/images/video.jpg deleted file mode 100644 index dd5d0c4ed2..0000000000 Binary files a/doc/tutorials/images/video.jpg and /dev/null differ diff --git a/doc/tutorials/images/viz.jpg b/doc/tutorials/images/viz.jpg deleted file mode 100644 index 7ac8f3ed8d..0000000000 Binary files a/doc/tutorials/images/viz.jpg and /dev/null differ diff --git a/doc/tutorials/imgcodecs/images/gdal-io.jpg b/doc/tutorials/imgcodecs/images/gdal-io.jpg deleted file mode 100644 index b2974ed2fb..0000000000 Binary files a/doc/tutorials/imgcodecs/images/gdal-io.jpg and /dev/null differ diff --git a/doc/tutorials/imgcodecs/table_of_content_highgui.markdown 
b/doc/tutorials/imgcodecs/table_of_content_highgui.markdown deleted file mode 100644 index b63b7b00ce..0000000000 --- a/doc/tutorials/imgcodecs/table_of_content_highgui.markdown +++ /dev/null @@ -1,14 +0,0 @@ -Image Input and Output (imgcodecs module) {#tutorial_table_of_content_imgcodecs} -========================================= - -This section contains tutorials about how to read/save your image files. - -- @subpage tutorial_raster_io_gdal - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Marvin Smith - - Read common GIS Raster and DEM files to display and manipulate geographic data. diff --git a/doc/tutorials/imgproc/anisotropic_image_segmentation/anisotropic_image_segmentation.markdown b/doc/tutorials/imgproc/anisotropic_image_segmentation/anisotropic_image_segmentation.markdown index 49fd621909..d480a19621 100644 --- a/doc/tutorials/imgproc/anisotropic_image_segmentation/anisotropic_image_segmentation.markdown +++ b/doc/tutorials/imgproc/anisotropic_image_segmentation/anisotropic_image_segmentation.markdown @@ -1,9 +1,16 @@ Anisotropic image segmentation by a gradient structure tensor {#tutorial_anisotropic_image_segmentation_by_a_gst} ========================== +@tableofcontents + @prev_tutorial{tutorial_motion_deblur_filter} @next_tutorial{tutorial_periodic_noise_removing_filter} +| | | +| -: | :- | +| Original author | Karpushin Vladislav | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/basic_geometric_drawing/basic_geometric_drawing.markdown b/doc/tutorials/imgproc/basic_geometric_drawing/basic_geometric_drawing.markdown index 77c44219f9..289000cbe2 100644 --- a/doc/tutorials/imgproc/basic_geometric_drawing/basic_geometric_drawing.markdown +++ b/doc/tutorials/imgproc/basic_geometric_drawing/basic_geometric_drawing.markdown @@ -1,8 +1,15 @@ Basic Drawing {#tutorial_basic_geometric_drawing} ============= +@tableofcontents + @next_tutorial{tutorial_random_generator_and_text} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goals ----- diff --git a/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown b/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown index 42f8c7c38f..99179f7691 100644 --- a/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown +++ b/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown @@ -1,9 +1,16 @@ Eroding and Dilating {#tutorial_erosion_dilatation} ==================== +@tableofcontents + @prev_tutorial{tutorial_gausian_median_blur_bilateral_filter} @next_tutorial{tutorial_opening_closing_hats} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/gausian_median_blur_bilateral_filter/gausian_median_blur_bilateral_filter.markdown b/doc/tutorials/imgproc/gausian_median_blur_bilateral_filter/gausian_median_blur_bilateral_filter.markdown index a03f95b6e4..beb09641c1 100644 --- a/doc/tutorials/imgproc/gausian_median_blur_bilateral_filter/gausian_median_blur_bilateral_filter.markdown +++ b/doc/tutorials/imgproc/gausian_median_blur_bilateral_filter/gausian_median_blur_bilateral_filter.markdown @@ -1,9 +1,16 @@ Smoothing Images {#tutorial_gausian_median_blur_bilateral_filter} ================ +@tableofcontents + @prev_tutorial{tutorial_random_generator_and_text} @next_tutorial{tutorial_erosion_dilatation} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git 
a/doc/tutorials/imgproc/histograms/back_projection/back_projection.markdown b/doc/tutorials/imgproc/histograms/back_projection/back_projection.markdown index 61baca9bf1..4bc5f35012 100644 --- a/doc/tutorials/imgproc/histograms/back_projection/back_projection.markdown +++ b/doc/tutorials/imgproc/histograms/back_projection/back_projection.markdown @@ -1,9 +1,16 @@ Back Projection {#tutorial_back_projection} =============== +@tableofcontents + @prev_tutorial{tutorial_histogram_comparison} @next_tutorial{tutorial_template_matching} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.markdown b/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.markdown index 0623ba12f2..5f16931269 100644 --- a/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.markdown +++ b/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.markdown @@ -1,9 +1,16 @@ Histogram Calculation {#tutorial_histogram_calculation} ===================== +@tableofcontents + @prev_tutorial{tutorial_histogram_equalization} @next_tutorial{tutorial_histogram_comparison} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.markdown b/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.markdown index 8b7bf78377..cc38cba07a 100644 --- a/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.markdown +++ b/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.markdown @@ -1,9 +1,16 @@ Histogram Comparison {#tutorial_histogram_comparison} ==================== +@tableofcontents + @prev_tutorial{tutorial_histogram_calculation} @next_tutorial{tutorial_back_projection} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/histograms/histogram_equalization/histogram_equalization.markdown b/doc/tutorials/imgproc/histograms/histogram_equalization/histogram_equalization.markdown index 271c6d1347..e80032ba7c 100644 --- a/doc/tutorials/imgproc/histograms/histogram_equalization/histogram_equalization.markdown +++ b/doc/tutorials/imgproc/histograms/histogram_equalization/histogram_equalization.markdown @@ -1,9 +1,16 @@ Histogram Equalization {#tutorial_histogram_equalization} ====================== +@tableofcontents + @prev_tutorial{tutorial_warp_affine} @next_tutorial{tutorial_histogram_calculation} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/histograms/template_matching/template_matching.markdown b/doc/tutorials/imgproc/histograms/template_matching/template_matching.markdown index 5cc39e3b17..f03f761dd1 100644 --- a/doc/tutorials/imgproc/histograms/template_matching/template_matching.markdown +++ b/doc/tutorials/imgproc/histograms/template_matching/template_matching.markdown @@ -1,9 +1,16 @@ Template Matching {#tutorial_template_matching} ================= +@tableofcontents + @prev_tutorial{tutorial_back_projection} @next_tutorial{tutorial_find_contours} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/hitOrMiss/hitOrMiss.markdown b/doc/tutorials/imgproc/hitOrMiss/hitOrMiss.markdown index 
c55f09296f..887b3765c0 100644 --- a/doc/tutorials/imgproc/hitOrMiss/hitOrMiss.markdown +++ b/doc/tutorials/imgproc/hitOrMiss/hitOrMiss.markdown @@ -1,9 +1,16 @@ Hit-or-Miss {#tutorial_hitOrMiss} ================================= +@tableofcontents + @prev_tutorial{tutorial_opening_closing_hats} @next_tutorial{tutorial_morph_lines_detection} +| | | +| -: | :- | +| Original author | Lorena García | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/images/Morphology_1_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/Morphology_1_Tutorial_Cover.jpg deleted file mode 100644 index 67da3a5ac0..0000000000 Binary files a/doc/tutorials/imgproc/images/Morphology_1_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/Morphology_2_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/Morphology_2_Tutorial_Cover.jpg deleted file mode 100644 index b3a1c55659..0000000000 Binary files a/doc/tutorials/imgproc/images/Morphology_2_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/Morphology_3_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/Morphology_3_Tutorial_Cover.jpg deleted file mode 100644 index 1eddc17554..0000000000 Binary files a/doc/tutorials/imgproc/images/Morphology_3_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/Pyramids_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/Pyramids_Tutorial_Cover.jpg deleted file mode 100644 index 0851cab278..0000000000 Binary files a/doc/tutorials/imgproc/images/Pyramids_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/Smoothing_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/Smoothing_Tutorial_Cover.jpg deleted file mode 100644 index 67656ab4b3..0000000000 Binary files a/doc/tutorials/imgproc/images/Smoothing_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/Threshold_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/Threshold_Tutorial_Cover.jpg deleted file mode 100644 index 6b115d88f5..0000000000 Binary files a/doc/tutorials/imgproc/images/Threshold_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/histograms/Back_Projection_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/histograms/Back_Projection_Tutorial_Cover.jpg deleted file mode 100644 index 013bdf6f2f..0000000000 Binary files a/doc/tutorials/imgproc/images/histograms/Back_Projection_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/histograms/Histogram_Calculation_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/histograms/Histogram_Calculation_Tutorial_Cover.jpg deleted file mode 100644 index 32d09e37e8..0000000000 Binary files a/doc/tutorials/imgproc/images/histograms/Histogram_Calculation_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/histograms/Histogram_Comparison_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/histograms/Histogram_Comparison_Tutorial_Cover.jpg deleted file mode 100644 index 7538a7203d..0000000000 Binary files a/doc/tutorials/imgproc/images/histograms/Histogram_Comparison_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/histograms/Histogram_Equalization_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/histograms/Histogram_Equalization_Tutorial_Cover.jpg deleted file mode 100644 index fbc5866835..0000000000 Binary files a/doc/tutorials/imgproc/images/histograms/Histogram_Equalization_Tutorial_Cover.jpg and /dev/null differ diff --git 
a/doc/tutorials/imgproc/images/histograms/Template_Matching_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/histograms/Template_Matching_Tutorial_Cover.jpg deleted file mode 100644 index e84f52119a..0000000000 Binary files a/doc/tutorials/imgproc/images/histograms/Template_Matching_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/Canny_Detector_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/Canny_Detector_Tutorial_Cover.jpg deleted file mode 100644 index bcd9ff9ace..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/Canny_Detector_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/CopyMakeBorder_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/CopyMakeBorder_Tutorial_Cover.jpg deleted file mode 100644 index f241ff2238..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/CopyMakeBorder_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/Distance_Transformation_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/Distance_Transformation_Tutorial_Cover.jpg deleted file mode 100644 index 8effc42a04..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/Distance_Transformation_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/Filter_2D_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/Filter_2D_Tutorial_Cover.jpg deleted file mode 100644 index c2f5809108..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/Filter_2D_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/Hough_Circle_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/Hough_Circle_Tutorial_Cover.jpg deleted file mode 100644 index 175180ae81..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/Hough_Circle_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/Hough_Lines_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/Hough_Lines_Tutorial_Cover.jpg deleted file mode 100644 index 4211ee2629..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/Hough_Lines_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/Laplace_Operator_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/Laplace_Operator_Tutorial_Cover.jpg deleted file mode 100644 index 14373f25b1..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/Laplace_Operator_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/Remap_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/Remap_Tutorial_Cover.jpg deleted file mode 100644 index bfb55dbdac..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/Remap_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/Sobel_Derivatives_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/Sobel_Derivatives_Tutorial_Cover.jpg deleted file mode 100644 index fbe17c8978..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/Sobel_Derivatives_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/imgtrans/Warp_Affine_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/imgtrans/Warp_Affine_Tutorial_Cover.jpg deleted file mode 100644 index 5655789bd6..0000000000 Binary files a/doc/tutorials/imgproc/images/imgtrans/Warp_Affine_Tutorial_Cover.jpg and /dev/null differ diff --git 
a/doc/tutorials/imgproc/images/shapedescriptors/Bounding_Rects_Circles_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/shapedescriptors/Bounding_Rects_Circles_Tutorial_Cover.jpg deleted file mode 100644 index be2ae57d40..0000000000 Binary files a/doc/tutorials/imgproc/images/shapedescriptors/Bounding_Rects_Circles_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/shapedescriptors/Bounding_Rotated_Ellipses_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/shapedescriptors/Bounding_Rotated_Ellipses_Tutorial_Cover.jpg deleted file mode 100644 index b7330592b5..0000000000 Binary files a/doc/tutorials/imgproc/images/shapedescriptors/Bounding_Rotated_Ellipses_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/shapedescriptors/Find_Contours_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/shapedescriptors/Find_Contours_Tutorial_Cover.jpg deleted file mode 100644 index 82888a1ba9..0000000000 Binary files a/doc/tutorials/imgproc/images/shapedescriptors/Find_Contours_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/shapedescriptors/Hull_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/shapedescriptors/Hull_Tutorial_Cover.jpg deleted file mode 100644 index a7a1b6ebeb..0000000000 Binary files a/doc/tutorials/imgproc/images/shapedescriptors/Hull_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/shapedescriptors/Moments_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/shapedescriptors/Moments_Tutorial_Cover.jpg deleted file mode 100644 index 1e865eb371..0000000000 Binary files a/doc/tutorials/imgproc/images/shapedescriptors/Moments_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/images/shapedescriptors/Point_Polygon_Test_Tutorial_Cover.jpg b/doc/tutorials/imgproc/images/shapedescriptors/Point_Polygon_Test_Tutorial_Cover.jpg deleted file mode 100644 index 9980df8421..0000000000 Binary files a/doc/tutorials/imgproc/images/shapedescriptors/Point_Polygon_Test_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/imgproc/imgtrans/canny_detector/canny_detector.markdown b/doc/tutorials/imgproc/imgtrans/canny_detector/canny_detector.markdown index 01bf6f862d..d585c77520 100644 --- a/doc/tutorials/imgproc/imgtrans/canny_detector/canny_detector.markdown +++ b/doc/tutorials/imgproc/imgtrans/canny_detector/canny_detector.markdown @@ -1,9 +1,16 @@ Canny Edge Detector {#tutorial_canny_detector} =================== +@tableofcontents + @prev_tutorial{tutorial_laplace_operator} @next_tutorial{tutorial_hough_lines} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/imgtrans/copyMakeBorder/copyMakeBorder.markdown b/doc/tutorials/imgproc/imgtrans/copyMakeBorder/copyMakeBorder.markdown index 8a4bbc0702..4acc06064f 100644 --- a/doc/tutorials/imgproc/imgtrans/copyMakeBorder/copyMakeBorder.markdown +++ b/doc/tutorials/imgproc/imgtrans/copyMakeBorder/copyMakeBorder.markdown @@ -1,9 +1,16 @@ Adding borders to your images {#tutorial_copyMakeBorder} ============================= +@tableofcontents + @prev_tutorial{tutorial_filter_2d} @next_tutorial{tutorial_sobel_derivatives} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/imgtrans/distance_transformation/distance_transform.markdown b/doc/tutorials/imgproc/imgtrans/distance_transformation/distance_transform.markdown index 
a5afffdbb1..28b81f4e6e 100644 --- a/doc/tutorials/imgproc/imgtrans/distance_transformation/distance_transform.markdown +++ b/doc/tutorials/imgproc/imgtrans/distance_transformation/distance_transform.markdown @@ -1,9 +1,16 @@ Image Segmentation with Distance Transform and Watershed Algorithm {#tutorial_distance_transform} ============= +@tableofcontents + @prev_tutorial{tutorial_point_polygon_test} @next_tutorial{tutorial_out_of_focus_deblur_filter} +| | | +| -: | :- | +| Original author | Theodore Tsesmelis | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/imgtrans/filter_2d/filter_2d.markdown b/doc/tutorials/imgproc/imgtrans/filter_2d/filter_2d.markdown index 454f745177..efe3fdf9ae 100644 --- a/doc/tutorials/imgproc/imgtrans/filter_2d/filter_2d.markdown +++ b/doc/tutorials/imgproc/imgtrans/filter_2d/filter_2d.markdown @@ -1,9 +1,16 @@ Making your own linear filters! {#tutorial_filter_2d} =============================== +@tableofcontents + @prev_tutorial{tutorial_threshold_inRange} @next_tutorial{tutorial_copyMakeBorder} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown b/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown index fe2f88be15..6b2f215901 100644 --- a/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown +++ b/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown @@ -1,9 +1,16 @@ Hough Circle Transform {#tutorial_hough_circle} ====================== +@tableofcontents + @prev_tutorial{tutorial_hough_lines} @next_tutorial{tutorial_remap} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown index 8b24d87a2d..5edff16879 100644 --- a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown +++ b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown @@ -1,9 +1,16 @@ Hough Line Transform {#tutorial_hough_lines} ==================== +@tableofcontents + @prev_tutorial{tutorial_canny_detector} @next_tutorial{tutorial_hough_circle} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- @@ -217,7 +224,7 @@ First you apply the transform: - *theta*: The resolution of the parameter \f$\theta\f$ in radians. We use **1 degree** (CV_PI/180) - *threshold*: The minimum number of intersections to "*detect*" a line - - *minLinLength*: The minimum number of points that can form a line. Lines with less than + - *minLineLength*: The minimum number of points that can form a line. Lines with less than this number of points are disregarded. - *maxLineGap*: The maximum gap between two points to be considered in the same line. 
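For readers cross-checking the renamed parameter above, *minLineLength* is the sixth argument of **HoughLinesP()**. A minimal sketch of the call under assumed inputs — the file name and all tuning values here are illustrative, not taken from the tutorial:

@code{.cpp}
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <vector>

int main()
{
    cv::Mat src = cv::imread("building.jpg", cv::IMREAD_GRAYSCALE); // illustrative input
    if (src.empty()) return -1;

    cv::Mat edges;
    cv::Canny(src, edges, 50, 200, 3); // the probabilistic Hough transform expects an edge map

    std::vector<cv::Vec4i> lines;
    // rho = 1 pixel, theta = 1 degree, threshold = 50 votes,
    // minLineLength = 50 px, maxLineGap = 10 px (all illustrative)
    cv::HoughLinesP(edges, lines, 1, CV_PI / 180, 50, 50, 10);

    cv::Mat display;
    cv::cvtColor(edges, display, cv::COLOR_GRAY2BGR);
    for (const cv::Vec4i& l : lines)
        cv::line(display, cv::Point(l[0], l[1]), cv::Point(l[2], l[3]),
                 cv::Scalar(0, 0, 255), 2, cv::LINE_AA);

    cv::imshow("Detected lines", display);
    cv::waitKey();
    return 0;
}
@endcode

Candidate segments shorter than *minLineLength*, or broken by gaps wider than *maxLineGap*, are discarded or split, which is why the two values are usually tuned together.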
diff --git a/doc/tutorials/imgproc/imgtrans/laplace_operator/laplace_operator.markdown b/doc/tutorials/imgproc/imgtrans/laplace_operator/laplace_operator.markdown index 63aed356b2..27b4aa98db 100644 --- a/doc/tutorials/imgproc/imgtrans/laplace_operator/laplace_operator.markdown +++ b/doc/tutorials/imgproc/imgtrans/laplace_operator/laplace_operator.markdown @@ -1,9 +1,16 @@ Laplace Operator {#tutorial_laplace_operator} ================ +@tableofcontents + @prev_tutorial{tutorial_sobel_derivatives} @next_tutorial{tutorial_canny_detector} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/imgtrans/remap/remap.markdown b/doc/tutorials/imgproc/imgtrans/remap/remap.markdown index 58c79c6039..3c3d95c4cc 100644 --- a/doc/tutorials/imgproc/imgtrans/remap/remap.markdown +++ b/doc/tutorials/imgproc/imgtrans/remap/remap.markdown @@ -1,9 +1,16 @@ Remapping {#tutorial_remap} ========= +@tableofcontents + @prev_tutorial{tutorial_hough_circle} @next_tutorial{tutorial_warp_affine} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/imgtrans/sobel_derivatives/sobel_derivatives.markdown b/doc/tutorials/imgproc/imgtrans/sobel_derivatives/sobel_derivatives.markdown index f8725d2a12..4183476524 100644 --- a/doc/tutorials/imgproc/imgtrans/sobel_derivatives/sobel_derivatives.markdown +++ b/doc/tutorials/imgproc/imgtrans/sobel_derivatives/sobel_derivatives.markdown @@ -1,9 +1,16 @@ Sobel Derivatives {#tutorial_sobel_derivatives} ================= +@tableofcontents + @prev_tutorial{tutorial_copyMakeBorder} @next_tutorial{tutorial_laplace_operator} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/imgtrans/warp_affine/warp_affine.markdown b/doc/tutorials/imgproc/imgtrans/warp_affine/warp_affine.markdown index b5023ad03e..22d5298f18 100644 --- a/doc/tutorials/imgproc/imgtrans/warp_affine/warp_affine.markdown +++ b/doc/tutorials/imgproc/imgtrans/warp_affine/warp_affine.markdown @@ -1,9 +1,16 @@ Affine Transformations {#tutorial_warp_affine} ====================== +@tableofcontents + @prev_tutorial{tutorial_remap} @next_tutorial{tutorial_histogram_equalization} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md b/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md index ce9e81e211..74b117f849 100644 --- a/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md +++ b/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md @@ -1,9 +1,16 @@ Extract horizontal and vertical lines by using morphological operations {#tutorial_morph_lines_detection} ============= +@tableofcontents + @prev_tutorial{tutorial_hitOrMiss} @next_tutorial{tutorial_pyramids} +| | | +| -: | :- | +| Original author | Theodore Tsesmelis | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/motion_deblur_filter/motion_deblur_filter.markdown b/doc/tutorials/imgproc/motion_deblur_filter/motion_deblur_filter.markdown index 704e0ef275..fd286deda5 100644 --- a/doc/tutorials/imgproc/motion_deblur_filter/motion_deblur_filter.markdown +++ b/doc/tutorials/imgproc/motion_deblur_filter/motion_deblur_filter.markdown @@ -1,9 +1,16 @@ Motion Deblur Filter {#tutorial_motion_deblur_filter} ========================== 
+@tableofcontents + @prev_tutorial{tutorial_out_of_focus_deblur_filter} @next_tutorial{tutorial_anisotropic_image_segmentation_by_a_gst} +| | | +| -: | :- | +| Original author | Karpushin Vladislav | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/opening_closing_hats/opening_closing_hats.markdown b/doc/tutorials/imgproc/opening_closing_hats/opening_closing_hats.markdown index e918c65ce7..8f991176d9 100644 --- a/doc/tutorials/imgproc/opening_closing_hats/opening_closing_hats.markdown +++ b/doc/tutorials/imgproc/opening_closing_hats/opening_closing_hats.markdown @@ -1,9 +1,16 @@ More Morphology Transformations {#tutorial_opening_closing_hats} =============================== +@tableofcontents + @prev_tutorial{tutorial_erosion_dilatation} @next_tutorial{tutorial_hitOrMiss} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown b/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown index 800286d9a8..13db710b32 100644 --- a/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown +++ b/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown @@ -1,9 +1,16 @@ Out-of-focus Deblur Filter {#tutorial_out_of_focus_deblur_filter} ========================== +@tableofcontents + @prev_tutorial{tutorial_distance_transform} @next_tutorial{tutorial_motion_deblur_filter} +| | | +| -: | :- | +| Original author | Karpushin Vladislav | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/periodic_noise_removing_filter/periodic_noise_removing_filter.markdown b/doc/tutorials/imgproc/periodic_noise_removing_filter/periodic_noise_removing_filter.markdown index 3c36a1e9c4..dff204a2f2 100644 --- a/doc/tutorials/imgproc/periodic_noise_removing_filter/periodic_noise_removing_filter.markdown +++ b/doc/tutorials/imgproc/periodic_noise_removing_filter/periodic_noise_removing_filter.markdown @@ -1,8 +1,15 @@ Periodic Noise Removing Filter {#tutorial_periodic_noise_removing_filter} ========================== +@tableofcontents + @prev_tutorial{tutorial_anisotropic_image_segmentation_by_a_gst} +| | | +| -: | :- | +| Original author | Karpushin Vladislav | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/pyramids/pyramids.markdown b/doc/tutorials/imgproc/pyramids/pyramids.markdown index c11a80527f..82ef81e369 100644 --- a/doc/tutorials/imgproc/pyramids/pyramids.markdown +++ b/doc/tutorials/imgproc/pyramids/pyramids.markdown @@ -1,9 +1,16 @@ Image Pyramids {#tutorial_pyramids} ============== +@tableofcontents + @prev_tutorial{tutorial_morph_lines_detection} @next_tutorial{tutorial_threshold} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- @@ -163,7 +170,7 @@ Our program exits if the user presses **ESC**. Besides, it has two options: We use the function **pyrDown()** with three arguments (similarly to **pyrUp()**): - *src*: The current and destination image (to be shown on screen, supposedly half the input image) - - *Size( tmp.cols/2, tmp.rows/2 )* : The destination size. Since we are upsampling, + - *Size( tmp.cols/2, tmp.rows/2 )* : The destination size. Since we are downsampling, **pyrDown()** expects half the size the input image (in this case *src*). 
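To make the corrected sentence concrete: **pyrDown()** produces an image half the width and height of its input, while **pyrUp()** doubles both dimensions. A minimal sketch — the input file name is only an illustrative assumption:

@code{.cpp}
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

int main()
{
    cv::Mat src = cv::imread("chicky_512.png"); // illustrative input
    if (src.empty()) return -1;

    cv::Mat half, restored;
    // Downsampling: the destination is half the size of the input
    cv::pyrDown(src, half, cv::Size(src.cols / 2, src.rows / 2));
    // Upsampling: the destination is twice the size of the input
    cv::pyrUp(half, restored, cv::Size(half.cols * 2, half.rows * 2));

    cv::imshow("Half size", half);
    cv::imshow("Restored size", restored);
    cv::waitKey();
    return 0;
}
@endcode

Note that chaining **pyrDown()** and then **pyrUp()** does not recover the original detail: the Gaussian smoothing in each step discards the higher spatial frequencies.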
@add_toggle_cpp diff --git a/doc/tutorials/imgproc/random_generator_and_text/random_generator_and_text.markdown b/doc/tutorials/imgproc/random_generator_and_text/random_generator_and_text.markdown index f588bbc44d..4100d91622 100644 --- a/doc/tutorials/imgproc/random_generator_and_text/random_generator_and_text.markdown +++ b/doc/tutorials/imgproc/random_generator_and_text/random_generator_and_text.markdown @@ -1,9 +1,16 @@ Random generator and text with OpenCV {#tutorial_random_generator_and_text} ===================================== +@tableofcontents + @prev_tutorial{tutorial_basic_geometric_drawing} @next_tutorial{tutorial_gausian_median_blur_bilateral_filter} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goals ----- diff --git a/doc/tutorials/imgproc/shapedescriptors/bounding_rects_circles/bounding_rects_circles.markdown b/doc/tutorials/imgproc/shapedescriptors/bounding_rects_circles/bounding_rects_circles.markdown index d6194dfd3f..2c6d59b579 100644 --- a/doc/tutorials/imgproc/shapedescriptors/bounding_rects_circles/bounding_rects_circles.markdown +++ b/doc/tutorials/imgproc/shapedescriptors/bounding_rects_circles/bounding_rects_circles.markdown @@ -1,9 +1,16 @@ Creating Bounding boxes and circles for contours {#tutorial_bounding_rects_circles} ================================================ +@tableofcontents + @prev_tutorial{tutorial_hull} @next_tutorial{tutorial_bounding_rotated_ellipses} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/shapedescriptors/bounding_rotated_ellipses/bounding_rotated_ellipses.markdown b/doc/tutorials/imgproc/shapedescriptors/bounding_rotated_ellipses/bounding_rotated_ellipses.markdown index a4c29b2fde..c15d73e291 100644 --- a/doc/tutorials/imgproc/shapedescriptors/bounding_rotated_ellipses/bounding_rotated_ellipses.markdown +++ b/doc/tutorials/imgproc/shapedescriptors/bounding_rotated_ellipses/bounding_rotated_ellipses.markdown @@ -1,9 +1,16 @@ Creating Bounding rotated boxes and ellipses for contours {#tutorial_bounding_rotated_ellipses} ========================================================= +@tableofcontents + @prev_tutorial{tutorial_bounding_rects_circles} @next_tutorial{tutorial_moments} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/shapedescriptors/find_contours/find_contours.markdown b/doc/tutorials/imgproc/shapedescriptors/find_contours/find_contours.markdown index b8aa6d898f..dc112b9402 100644 --- a/doc/tutorials/imgproc/shapedescriptors/find_contours/find_contours.markdown +++ b/doc/tutorials/imgproc/shapedescriptors/find_contours/find_contours.markdown @@ -1,9 +1,16 @@ Finding contours in your image {#tutorial_find_contours} ============================== +@tableofcontents + @prev_tutorial{tutorial_template_matching} @next_tutorial{tutorial_hull} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/shapedescriptors/hull/hull.markdown b/doc/tutorials/imgproc/shapedescriptors/hull/hull.markdown index e40934e6e2..36763fd81d 100644 --- a/doc/tutorials/imgproc/shapedescriptors/hull/hull.markdown +++ b/doc/tutorials/imgproc/shapedescriptors/hull/hull.markdown @@ -1,9 +1,16 @@ Convex Hull {#tutorial_hull} =========== +@tableofcontents + @prev_tutorial{tutorial_find_contours} @next_tutorial{tutorial_bounding_rects_circles} +| | | +| -: | :- | +| 
Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/shapedescriptors/moments/moments.markdown b/doc/tutorials/imgproc/shapedescriptors/moments/moments.markdown index 683568ab0c..4e47242cd1 100644 --- a/doc/tutorials/imgproc/shapedescriptors/moments/moments.markdown +++ b/doc/tutorials/imgproc/shapedescriptors/moments/moments.markdown @@ -1,9 +1,16 @@ Image Moments {#tutorial_moments} ============= +@tableofcontents + @prev_tutorial{tutorial_bounding_rotated_ellipses} @next_tutorial{tutorial_point_polygon_test} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/shapedescriptors/point_polygon_test/point_polygon_test.markdown b/doc/tutorials/imgproc/shapedescriptors/point_polygon_test/point_polygon_test.markdown index 2e02fb8815..6f4c6d7ebd 100644 --- a/doc/tutorials/imgproc/shapedescriptors/point_polygon_test/point_polygon_test.markdown +++ b/doc/tutorials/imgproc/shapedescriptors/point_polygon_test/point_polygon_test.markdown @@ -1,9 +1,16 @@ Point Polygon Test {#tutorial_point_polygon_test} ================== +@tableofcontents + @prev_tutorial{tutorial_moments} @next_tutorial{tutorial_distance_transform} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/table_of_content_imgproc.markdown b/doc/tutorials/imgproc/table_of_content_imgproc.markdown index b0a8b8260b..edffd706bd 100644 --- a/doc/tutorials/imgproc/table_of_content_imgproc.markdown +++ b/doc/tutorials/imgproc/table_of_content_imgproc.markdown @@ -1,298 +1,52 @@ Image Processing (imgproc module) {#tutorial_table_of_content_imgproc} ================================= -In this section you will learn about the image processing (manipulation) functions inside OpenCV. - +Basic +----- - @subpage tutorial_basic_geometric_drawing - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - We will learn how to draw simple geometry with OpenCV! - - @subpage tutorial_random_generator_and_text - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - We will draw some *fancy-looking* stuff using OpenCV! - - @subpage tutorial_gausian_median_blur_bilateral_filter - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Let's take a look at some basic linear filters! - - @subpage tutorial_erosion_dilatation - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - Author: Ana Huamán - - Let's *change* the shape of objects! - - @subpage tutorial_opening_closing_hats - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Here we investigate different morphology operators - - @subpage tutorial_hitOrMiss - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.4 - - *Author:* Lorena García - - Learn how to find patterns in binary images using the Hit-or-Miss operation - - @subpage tutorial_morph_lines_detection - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Theodore Tsesmelis - - Here we will show how we can use different morphological operators to extract horizontal and vertical lines - - @subpage tutorial_pyramids - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - What if I need a bigger/smaller image? 
- - @subpage tutorial_threshold - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - After so much processing, it is time to decide which pixels stay - - @subpage tutorial_threshold_inRange - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Rishiraj Surti - - Thresholding operations using inRange function. - +Transformations +--------------- - @subpage tutorial_filter_2d - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn to design our own filters by using OpenCV functions - - @subpage tutorial_copyMakeBorder - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to pad our images - - @subpage tutorial_sobel_derivatives - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to calculate gradients and use them to detect edges - - @subpage tutorial_laplace_operator - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn about the *Laplace* operator and how to detect edges with it - - @subpage tutorial_canny_detector - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn a sophisticated alternative to detect edges - - @subpage tutorial_hough_lines - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to detect lines - - @subpage tutorial_hough_circle - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to detect circles - - @subpage tutorial_remap - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to manipulate pixels locations - - @subpage tutorial_warp_affine - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to rotate, translate and scale our images - +Histograms +---------- - @subpage tutorial_histogram_equalization - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to improve the contrast in our images - - @subpage tutorial_histogram_calculation - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to create and generate histograms - - @subpage tutorial_histogram_comparison - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn to calculate metrics between histograms - - @subpage tutorial_back_projection - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to use histograms to find similar objects in images - - @subpage tutorial_template_matching - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to match templates in an image - -- @subpage tutorial_table_of_contents_contours - - Learn how to find contours in images and investigate their properties and features. 
+Contours +-------- +- @subpage tutorial_find_contours +- @subpage tutorial_hull +- @subpage tutorial_bounding_rects_circles +- @subpage tutorial_bounding_rotated_ellipses +- @subpage tutorial_moments +- @subpage tutorial_point_polygon_test +Others +------ - @subpage tutorial_distance_transform - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Theodore Tsesmelis - - Where we learn to segment objects using Laplacian filtering, the Distance Transformation and the Watershed algorithm. - - @subpage tutorial_out_of_focus_deblur_filter - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Karpushin Vladislav - - You will learn how to recover an out-of-focus image by Wiener filter. - - @subpage tutorial_motion_deblur_filter - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Karpushin Vladislav - - You will learn how to recover an image with motion blur distortion using a Wiener filter. - - @subpage tutorial_anisotropic_image_segmentation_by_a_gst - - *Languages:* C++, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Karpushin Vladislav - - You will learn how to segment an anisotropic image with a single local orientation by a gradient structure tensor. - - @subpage tutorial_periodic_noise_removing_filter - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Karpushin Vladislav - - You will learn how to remove periodic noise in the Fourier domain. diff --git a/doc/tutorials/imgproc/table_of_contents_contours.markdown b/doc/tutorials/imgproc/table_of_contents_contours.markdown index 3e8bba3a62..cc2f133bfd 100644 --- a/doc/tutorials/imgproc/table_of_contents_contours.markdown +++ b/doc/tutorials/imgproc/table_of_contents_contours.markdown @@ -1,62 +1,4 @@ Contours in OpenCV {#tutorial_table_of_contents_contours} ================== -- @subpage tutorial_find_contours - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to find contours of objects in our image - -- @subpage tutorial_hull - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to get hull contours and draw them - -- @subpage tutorial_bounding_rects_circles - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to obtain bounding boxes and circles for our contours - -- @subpage tutorial_bounding_rotated_ellipses - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to obtain rotated bounding boxes and ellipses for our contours - -- @subpage tutorial_moments - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn to calculate the moments of an image - -- @subpage tutorial_point_polygon_test - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Where we learn how to calculate distances from the image to contours +Content has been moved to this page: @ref tutorial_table_of_content_imgproc diff --git a/doc/tutorials/imgproc/threshold/threshold.markdown b/doc/tutorials/imgproc/threshold/threshold.markdown index a452d14042..f7458d192c 100644 --- a/doc/tutorials/imgproc/threshold/threshold.markdown +++ b/doc/tutorials/imgproc/threshold/threshold.markdown @@ -1,9 +1,16 @@ Basic Thresholding Operations {#tutorial_threshold} ============================= +@tableofcontents + @prev_tutorial{tutorial_pyramids} 
@next_tutorial{tutorial_threshold_inRange} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/imgproc/threshold_inRange/threshold_inRange.markdown b/doc/tutorials/imgproc/threshold_inRange/threshold_inRange.markdown index 0995b9758c..f05ebe477a 100644 --- a/doc/tutorials/imgproc/threshold_inRange/threshold_inRange.markdown +++ b/doc/tutorials/imgproc/threshold_inRange/threshold_inRange.markdown @@ -1,9 +1,16 @@ Thresholding Operations using inRange {#tutorial_threshold_inRange} ===================================== +@tableofcontents + @prev_tutorial{tutorial_threshold} @next_tutorial{tutorial_filter_2d} +| | | +| -: | :- | +| Original author | Lorena García | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/introduction/android_binary_package/O4A_SDK.markdown b/doc/tutorials/introduction/android_binary_package/O4A_SDK.markdown index bfc5842441..938dd613ed 100644 --- a/doc/tutorials/introduction/android_binary_package/O4A_SDK.markdown +++ b/doc/tutorials/introduction/android_binary_package/O4A_SDK.markdown @@ -28,7 +28,7 @@ If you need help with anything of the above, you may refer to our @ref tutorial_ If you encounter any error after thoroughly following these steps, feel free to contact us via [OpenCV4Android](https://groups.google.com/group/android-opencv/) discussion group or OpenCV [Q&A -forum](http://answers.opencv.org). We'll do our best to help you out. +forum](https://forum.opencv.org). We'll do our best to help you out. General info ------------ diff --git a/doc/tutorials/introduction/android_binary_package/android_dev_intro.markdown b/doc/tutorials/introduction/android_binary_package/android_dev_intro.markdown index 47dc07be43..584f9a2f6c 100644 --- a/doc/tutorials/introduction/android_binary_package/android_dev_intro.markdown +++ b/doc/tutorials/introduction/android_binary_package/android_dev_intro.markdown @@ -19,7 +19,7 @@ working environment quickly. It was written with Windows 7 in mind, though it wo If you encounter any error after thoroughly following these steps, feel free to contact us via [OpenCV4Android](https://groups.google.com/group/android-opencv/) discussion group or OpenCV [Q&A -forum](http://answers.opencv.org). We'll do our best to help you out. +forum](https://forum.opencv.org). We'll do our best to help you out. Preface ------- diff --git a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown index 92dd77af67..5acdbc41ed 100644 --- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown +++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown @@ -32,7 +32,7 @@ may consult our @ref tutorial_O4A_SDK tutorial. If you encounter any error after thoroughly following these steps, feel free to contact us via [OpenCV4Android](https://groups.google.com/group/android-opencv/) discussion group or OpenCV [Q&A -forum](http://answers.opencv.org) . We'll do our best to help you out. +forum](https://forum.opencv.org) . We'll do our best to help you out.
Using OpenCV Library Within Your Android Project ------------------------------------------------ diff --git a/doc/tutorials/introduction/config_reference/config_reference.markdown b/doc/tutorials/introduction/config_reference/config_reference.markdown index 1cabf23200..1d4f426c8f 100644 --- a/doc/tutorials/introduction/config_reference/config_reference.markdown +++ b/doc/tutorials/introduction/config_reference/config_reference.markdown @@ -1,6 +1,9 @@ OpenCV configuration options reference {#tutorial_config_reference} ====================================== +@prev_tutorial{tutorial_general_install} +@next_tutorial{tutorial_linux_install} + @tableofcontents # Introduction {#tutorial_config_reference_intro} @@ -411,9 +414,9 @@ Some of OpenCV algorithms can use multithreading to accelerate processing. OpenC | pthreads | `WITH_PTHREADS_PF` | _ON_ | Unix-like | Default backend based on [pthreads](https://en.wikipedia.org/wiki/POSIX_Threads) library is available on Linux, Android and other Unix-like platforms. Thread pool is implemented in OpenCV and can be controlled with environment variables `OPENCV_THREAD_POOL_*`. Please check sources in _modules/core/src/parallel_impl.cpp_ file for details. | | Concurrency | N/A | _ON_ | Windows | [Concurrency runtime](https://docs.microsoft.com/en-us/cpp/parallel/concrt/concurrency-runtime) is available on Windows and will be turned _ON_ on supported platforms unless other backend is enabled. | | GCD | N/A | _ON_ | Apple | [Grand Central Dispatch](https://en.wikipedia.org/wiki/Grand_Central_Dispatch) is available on Apple platforms and will be turned _ON_ automatically unless other backend is enabled. Uses global system thread pool. | -| TBB | `WITH_TBB` | Multiple | _OFF_ | [Threading Building Blocks](https://en.wikipedia.org/wiki/Threading_Building_Blocks) is a cross-platform library for parallel programming. | -| OpenMP | `WITH_OPENMP` | Multiple | _OFF_ | [OpenMP](https://en.wikipedia.org/wiki/OpenMP) API relies on compiler support. | -| HPX | `WITH_HPX` | Multiple | _OFF_ | [High Performance ParallelX](https://en.wikipedia.org/wiki/HPX) is an experimental backend which is more suitable for multiprocessor environments. | +| TBB | `WITH_TBB` | _OFF_ | Multiple | [Threading Building Blocks](https://en.wikipedia.org/wiki/Threading_Building_Blocks) is a cross-platform library for parallel programming. | +| OpenMP | `WITH_OPENMP` | _OFF_ | Multiple | [OpenMP](https://en.wikipedia.org/wiki/OpenMP) API relies on compiler support. | +| HPX | `WITH_HPX` | _OFF_ | Multiple | [High Performance ParallelX](https://en.wikipedia.org/wiki/HPX) is an experimental backend which is more suitable for multiprocessor environments. | @note OpenCV can download and build TBB library from GitHub, this functionality can be enabled with the `BUILD_TBB` option. 
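Whichever backend the `WITH_*` options in the table above select, it sits behind the same runtime API, so application code does not change between TBB, OpenMP, and pthreads builds. A minimal sketch of exercising that API — the thread cap and the loop body are illustrative only:

@code{.cpp}
#include <opencv2/core.hpp>
#include <iostream>
#include <vector>

int main()
{
    std::cout << "Worker threads available: " << cv::getNumThreads() << std::endl;
    cv::setNumThreads(4); // cap the pool of whichever backend was compiled in

    std::vector<int> results(1000);
    cv::parallel_for_(cv::Range(0, (int)results.size()), [&](const cv::Range& range)
    {
        // Each chunk of the range may run on a different worker thread
        for (int i = range.start; i < range.end; i++)
            results[i] = i * i;
    });
    std::cout << "results[999] = " << results[999] << std::endl;
    return 0;
}
@endcode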
diff --git a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.markdown b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.markdown index 91353b2990..058b5c92f2 100644 --- a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.markdown +++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.markdown @@ -1,7 +1,7 @@ Cross compilation for ARM based Linux systems {#tutorial_arm_crosscompile_with_cmake} ============================================= -@prev_tutorial{tutorial_ios_install} +@prev_tutorial{tutorial_macos_install} @next_tutorial{tutorial_building_tegra_cuda} | | | diff --git a/doc/tutorials/introduction/documenting_opencv/documentation_tutorial.markdown b/doc/tutorials/introduction/documenting_opencv/documentation_tutorial.markdown index 2cce88c856..3654f939c0 100644 --- a/doc/tutorials/introduction/documenting_opencv/documentation_tutorial.markdown +++ b/doc/tutorials/introduction/documenting_opencv/documentation_tutorial.markdown @@ -667,20 +667,9 @@ Write the tutorial {#tutorial_documentation_steps_tutorial} 6. Add newly created tutorial to the corresponding table of contents. Just find "table_of_content_*.markdown" file with the needed table and place new record in it similar to existing ones. - @verbatim -- @subpage tutorial_windows_visual_studio_image_watch - _Languages:_ C++, Java, Python - - _Compatibility:_ \>= OpenCV 2.4 - - _Author:_ Wolf Kienzle - - You will learn how to visualize OpenCV matrices and images within Visual Studio 2012. - @endverbatim - As you can see it is just a list item with special _subpage_ command which marks your page as a - child and places it into the existing pages hierarchy. Add compatibility information, - authors list and short description. Also note the list item indent, empty lines between + It is simply a list item with a special _subpage_ command which marks your page as a + child and places it into the existing pages hierarchy. Also note the list item indent, empty lines between paragraphs and special _italic_ markers. 7. Generate doxygen documentation and verify results. diff --git a/doc/tutorials/introduction/general_install/general_install.markdown b/doc/tutorials/introduction/general_install/general_install.markdown index 2fa3a17223..e8c93f430e 100644 --- a/doc/tutorials/introduction/general_install/general_install.markdown +++ b/doc/tutorials/introduction/general_install/general_install.markdown @@ -1,6 +1,8 @@ OpenCV installation overview {#tutorial_general_install} ============================ +@next_tutorial{tutorial_config_reference} + @tableofcontents There are two ways of installing OpenCV on your machine: download prebuilt version for your platform or compile from sources.
diff --git a/doc/tutorials/introduction/images/Display_Image_Tutorial_Result.jpg b/doc/tutorials/introduction/images/Display_Image_Tutorial_Result.jpg deleted file mode 100644 index 16400698f0..0000000000 Binary files a/doc/tutorials/introduction/images/Display_Image_Tutorial_Result.jpg and /dev/null differ diff --git a/doc/tutorials/introduction/images/Java_logo.png b/doc/tutorials/introduction/images/Java_logo.png deleted file mode 100644 index 2114751896..0000000000 Binary files a/doc/tutorials/introduction/images/Java_logo.png and /dev/null differ diff --git a/doc/tutorials/introduction/images/android_logo.png b/doc/tutorials/introduction/images/android_logo.png deleted file mode 100644 index 69bccd74d2..0000000000 Binary files a/doc/tutorials/introduction/images/android_logo.png and /dev/null differ diff --git a/doc/tutorials/introduction/images/clojure-logo.png b/doc/tutorials/introduction/images/clojure-logo.png deleted file mode 100644 index f8a29b965c..0000000000 Binary files a/doc/tutorials/introduction/images/clojure-logo.png and /dev/null differ diff --git a/doc/tutorials/introduction/images/eclipse-logo.png b/doc/tutorials/introduction/images/eclipse-logo.png deleted file mode 100644 index 64ec01c253..0000000000 Binary files a/doc/tutorials/introduction/images/eclipse-logo.png and /dev/null differ diff --git a/doc/tutorials/introduction/images/eclipse_cpp_logo.jpeg b/doc/tutorials/introduction/images/eclipse_cpp_logo.jpeg deleted file mode 100644 index e63e26b1b4..0000000000 Binary files a/doc/tutorials/introduction/images/eclipse_cpp_logo.jpeg and /dev/null differ diff --git a/doc/tutorials/introduction/images/gccegg-65.jpg b/doc/tutorials/introduction/images/gccegg-65.jpg deleted file mode 100644 index e3e44d1f6c..0000000000 Binary files a/doc/tutorials/introduction/images/gccegg-65.jpg and /dev/null differ diff --git a/doc/tutorials/introduction/images/how_to_write_a_tutorial.png b/doc/tutorials/introduction/images/how_to_write_a_tutorial.png deleted file mode 100644 index ae40fc3d32..0000000000 Binary files a/doc/tutorials/introduction/images/how_to_write_a_tutorial.png and /dev/null differ diff --git a/doc/tutorials/introduction/images/lena.png b/doc/tutorials/introduction/images/lena.png deleted file mode 100644 index 68342fae53..0000000000 Binary files a/doc/tutorials/introduction/images/lena.png and /dev/null differ diff --git a/doc/tutorials/introduction/images/opencv_ios.png b/doc/tutorials/introduction/images/opencv_ios.png deleted file mode 100644 index ce2031d7c0..0000000000 Binary files a/doc/tutorials/introduction/images/opencv_ios.png and /dev/null differ diff --git a/doc/tutorials/introduction/images/ubuntu-logo.jpg b/doc/tutorials/introduction/images/ubuntu-logo.jpg deleted file mode 100644 index a34243496c..0000000000 Binary files a/doc/tutorials/introduction/images/ubuntu-logo.jpg and /dev/null differ diff --git a/doc/tutorials/introduction/images/visual-studio-2010-logo.jpg b/doc/tutorials/introduction/images/visual-studio-2010-logo.jpg deleted file mode 100644 index 8b053695c4..0000000000 Binary files a/doc/tutorials/introduction/images/visual-studio-2010-logo.jpg and /dev/null differ diff --git a/doc/tutorials/introduction/images/visual_studio_image_watch.png b/doc/tutorials/introduction/images/visual_studio_image_watch.png deleted file mode 100644 index e693344df8..0000000000 Binary files a/doc/tutorials/introduction/images/visual_studio_image_watch.png and /dev/null differ diff --git a/doc/tutorials/introduction/images/windows_logo.jpg 
b/doc/tutorials/introduction/images/windows_logo.jpg deleted file mode 100644 index e35a8a86ae..0000000000 Binary files a/doc/tutorials/introduction/images/windows_logo.jpg and /dev/null differ diff --git a/doc/tutorials/introduction/macos_install/macos_install.markdown b/doc/tutorials/introduction/macos_install/macos_install.markdown index ec708101a0..dadce9304c 100644 --- a/doc/tutorials/introduction/macos_install/macos_install.markdown +++ b/doc/tutorials/introduction/macos_install/macos_install.markdown @@ -2,7 +2,7 @@ Installation in MacOS {#tutorial_macos_install} ===================== @prev_tutorial{tutorial_android_ocl_intro} -@next_tutorial{tutorial_ios_install} +@next_tutorial{tutorial_arm_crosscompile_with_cmake} | | | | -: | :- | diff --git a/doc/tutorials/introduction/table_of_content_introduction.markdown b/doc/tutorials/introduction/table_of_content_introduction.markdown index 2eb95e7b72..d1f2aa3ca3 100644 --- a/doc/tutorials/introduction/table_of_content_introduction.markdown +++ b/doc/tutorials/introduction/table_of_content_introduction.markdown @@ -25,9 +25,9 @@ Introduction to OpenCV {#tutorial_table_of_content_introduction} ##### Other platforms - @subpage tutorial_macos_install -- @subpage tutorial_ios_install - @subpage tutorial_arm_crosscompile_with_cmake - @subpage tutorial_building_tegra_cuda +- @ref tutorial_ios_install ##### Usage basics - @subpage tutorial_display_image - We will learn how to load an image from file and display it using OpenCV diff --git a/doc/tutorials/introduction/windows_install/windows_install.markdown b/doc/tutorials/introduction/windows_install/windows_install.markdown index 0eff40dd13..56fe64998c 100644 --- a/doc/tutorials/introduction/windows_install/windows_install.markdown +++ b/doc/tutorials/introduction/windows_install/windows_install.markdown @@ -14,7 +14,7 @@ This tutorial can contain obsolete information. The description here was tested on Windows 7 SP1. Nevertheless, it should also work on any other relatively modern version of Windows OS. If you encounter errors after following the steps described -below, feel free to contact us via our [OpenCV Q&A forum](http://answers.opencv.org). We'll do our +below, feel free to contact us via our [OpenCV Q&A forum](https://forum.opencv.org). We'll do our best to help you out. @note To use the OpenCV library you have two options: @ref tutorial_windows_install_prebuilt or @@ -55,12 +55,12 @@ cd /c/lib @code{.bash} #!/bin/bash -e myRepo=$(pwd) -CMAKE_CONFIG_GENERATOR="Visual Studio 14 2015 Win64" +CMAKE_GENERATOR_OPTIONS=-G"Visual Studio 16 2019" +#CMAKE_GENERATOR_OPTIONS=-G"Visual Studio 15 2017 Win64" +#CMAKE_GENERATOR_OPTIONS=(-G"Visual Studio 16 2019" -A x64) # CMake 3.14+ is required if [ ! -d "$myRepo/opencv" ]; then echo "cloning opencv" git clone https://github.com/opencv/opencv.git - mkdir -p Build/opencv - mkdir -p Install/opencv else cd opencv git pull --rebase @@ -69,16 +69,17 @@ fi if [ ! -d "$myRepo/opencv_contrib" ]; then echo "cloning opencv_contrib" git clone https://github.com/opencv/opencv_contrib.git - mkdir -p Build/opencv_contrib else cd opencv_contrib git pull --rebase cd .. 
fi RepoSource=opencv -pushd Build/$RepoSource -CMAKE_OPTIONS='-DBUILD_PERF_TESTS:BOOL=OFF -DBUILD_TESTS:BOOL=OFF -DBUILD_DOCS:BOOL=OFF -DWITH_CUDA:BOOL=OFF -DBUILD_EXAMPLES:BOOL=OFF -DINSTALL_CREATE_DISTRIB=ON' -cmake -G"$CMAKE_CONFIG_GENERATOR" $CMAKE_OPTIONS -DOPENCV_EXTRA_MODULES_PATH="$myRepo"/opencv_contrib/modules -DCMAKE_INSTALL_PREFIX="$myRepo"/install/"$RepoSource" "$myRepo/$RepoSource" +mkdir -p build_opencv +pushd build_opencv +CMAKE_OPTIONS=(-DBUILD_PERF_TESTS:BOOL=OFF -DBUILD_TESTS:BOOL=OFF -DBUILD_DOCS:BOOL=OFF -DWITH_CUDA:BOOL=OFF -DBUILD_EXAMPLES:BOOL=OFF -DINSTALL_CREATE_DISTRIB=ON) +set -x +cmake "${CMAKE_GENERATOR_OPTIONS[@]}" "${CMAKE_OPTIONS[@]}" -DOPENCV_EXTRA_MODULES_PATH="$myRepo"/opencv_contrib/modules -DCMAKE_INSTALL_PREFIX="$myRepo/install/$RepoSource" "$myRepo/$RepoSource" echo "************************* $Source_DIR -->debug" cmake --build . --config debug echo "************************* $Source_DIR -->release" @@ -89,15 +90,15 @@ popd @endcode In this script I suppose you use VS 2015 in 64 bits @code{.bash} -CMAKE_CONFIG_GENERATOR="Visual Studio 14 2015 Win64" +CMAKE_GENERATOR_OPTIONS=-G"Visual Studio 14 2015 Win64" @endcode - and opencv will be installed in c:/lib/install + and opencv will be installed in c:/lib/install/opencv @code{.bash} --DCMAKE_INSTALL_PREFIX="$myRepo"/install/"$RepoSource" "$myRepo/$RepoSource" +-DCMAKE_INSTALL_PREFIX="$myRepo/install/$RepoSource" @endcode with no Perf tests, no tests, no doc, no CUDA and no example @code{.bash} -CMAKE_OPTIONS='-DBUILD_PERF_TESTS:BOOL=OFF -DBUILD_TESTS:BOOL=OFF -DBUILD_DOCS:BOOL=OFF -DBUILD_EXAMPLES:BOOL=OFF' +CMAKE_OPTIONS=(-DBUILD_PERF_TESTS:BOOL=OFF -DBUILD_TESTS:BOOL=OFF -DBUILD_DOCS:BOOL=OFF -DBUILD_EXAMPLES:BOOL=OFF) @endcode -# In git command line enter following command : @code{.bash} @@ -351,7 +352,7 @@ libraries). If you do not need the support for some of these, you can just freel To test your build just go into the `Build/bin/Debug` or `Build/bin/Release` directory and start a couple of applications like the *contours.exe*. If they run, you are done. Otherwise, - something definitely went awfully wrong. In this case you should contact us at our [Q&A forum](http://answers.opencv.org/). + something definitely went awfully wrong. In this case you should contact us at our [Q&A forum](https://forum.opencv.org/). 
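A hand-written equivalent of that *contours.exe* smoke test takes only a few lines; a minimal sketch, assuming a hypothetical input image `test.png`:

@code{.cpp}
// Build self-test: load an image, binarize it, count external contours.
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <vector>

int main()
{
    // "test.png" is a hypothetical input file.
    cv::Mat img = cv::imread("test.png", cv::IMREAD_GRAYSCALE);
    if (img.empty()) { std::cerr << "cannot read test.png" << std::endl; return 1; }

    cv::Mat bin;
    cv::threshold(img, bin, 128, 255, cv::THRESH_BINARY);

    std::vector<std::vector<cv::Point> > contours;
    cv::findContours(bin, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);
    std::cout << "found " << contours.size() << " contours" << std::endl;
    return 0;
}
@endcode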
If everything is okay, the *contours.exe* output should resemble the following image (if built with Qt support): diff --git a/doc/tutorials/ios/hello/hello.markdown b/doc/tutorials/ios/hello/hello.markdown index fc6992cc70..87d39fb8fb 100644 --- a/doc/tutorials/ios/hello/hello.markdown +++ b/doc/tutorials/ios/hello/hello.markdown @@ -1,8 +1,16 @@ OpenCV iOS Hello {#tutorial_hello} ================ +@tableofcontents + +@prev_tutorial{tutorial_ios_install} @next_tutorial{tutorial_image_manipulation} +| | | +| -: | :- | +| Original author | Charu Hans | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/ios/image_manipulation/image_manipulation.markdown b/doc/tutorials/ios/image_manipulation/image_manipulation.markdown index 61590d8f77..f01aa6e4f8 100644 --- a/doc/tutorials/ios/image_manipulation/image_manipulation.markdown +++ b/doc/tutorials/ios/image_manipulation/image_manipulation.markdown @@ -1,9 +1,16 @@ OpenCV iOS - Image Processing {#tutorial_image_manipulation} ============================= +@tableofcontents + @prev_tutorial{tutorial_hello} @next_tutorial{tutorial_video_processing} +| | | +| -: | :- | +| Original author | Charu Hans | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/ios/images/facedetect.jpg b/doc/tutorials/ios/images/facedetect.jpg deleted file mode 100644 index 788b7d8262..0000000000 Binary files a/doc/tutorials/ios/images/facedetect.jpg and /dev/null differ diff --git a/doc/tutorials/ios/images/image_effects.png b/doc/tutorials/ios/images/image_effects.png deleted file mode 100644 index 25edb668f9..0000000000 Binary files a/doc/tutorials/ios/images/image_effects.png and /dev/null differ diff --git a/doc/tutorials/ios/images/intro.png b/doc/tutorials/ios/images/intro.png deleted file mode 100644 index 5f2dc1aa4c..0000000000 Binary files a/doc/tutorials/ios/images/intro.png and /dev/null differ diff --git a/doc/tutorials/introduction/ios_install/ios_install.markdown b/doc/tutorials/ios/ios_install/ios_install.markdown similarity index 96% rename from doc/tutorials/introduction/ios_install/ios_install.markdown rename to doc/tutorials/ios/ios_install/ios_install.markdown index cbe3902602..4353808e14 100644 --- a/doc/tutorials/introduction/ios_install/ios_install.markdown +++ b/doc/tutorials/ios/ios_install/ios_install.markdown @@ -1,8 +1,9 @@ Installation in iOS {#tutorial_ios_install} =================== -@prev_tutorial{tutorial_macos_install} -@next_tutorial{tutorial_arm_crosscompile_with_cmake} +@tableofcontents + +@next_tutorial{tutorial_hello} | | | | -: | :- | diff --git a/doc/tutorials/ios/table_of_content_ios.markdown b/doc/tutorials/ios/table_of_content_ios.markdown index 4031c6c80b..99cfea5306 100644 --- a/doc/tutorials/ios/table_of_content_ios.markdown +++ b/doc/tutorials/ios/table_of_content_ios.markdown @@ -1,32 +1,6 @@ OpenCV iOS {#tutorial_table_of_content_ios} ========== - +- @subpage tutorial_ios_install - @subpage tutorial_hello - - *Languages:* Objective-C++ - - *Compatibility:* \> OpenCV 2.4.3 - - *Author:* Charu Hans - - You will learn how to link OpenCV with iOS and write a basic application. - - @subpage tutorial_image_manipulation - - *Languages:* Objective-C++ - - *Compatibility:* \> OpenCV 2.4.3 - - *Author:* Charu Hans - - You will learn how to do simple image manipulation using OpenCV in iOS. 
- - @subpage tutorial_video_processing - - *Languages:* Objective-C++ - - *Compatibility:* \> OpenCV 2.4.3 - - *Author:* Eduard Feicho - - You will learn how to capture and process video from camera using OpenCV in iOS. diff --git a/doc/tutorials/ios/video_processing/video_processing.markdown b/doc/tutorials/ios/video_processing/video_processing.markdown index 04bdd14e89..e388d8262c 100644 --- a/doc/tutorials/ios/video_processing/video_processing.markdown +++ b/doc/tutorials/ios/video_processing/video_processing.markdown @@ -1,8 +1,15 @@ OpenCV iOS - Video Processing {#tutorial_video_processing} ============================= +@tableofcontents + @prev_tutorial{tutorial_image_manipulation} +| | | +| -: | :- | +| Original author | Eduard Feicho | +| Compatibility | OpenCV >= 3.0 | + This tutorial explains how to process video frames using the iPhone's camera and OpenCV. diff --git a/doc/tutorials/ml/images/introduction_to_pca_cover.png b/doc/tutorials/ml/images/introduction_to_pca_cover.png deleted file mode 100644 index ce230029ec..0000000000 Binary files a/doc/tutorials/ml/images/introduction_to_pca_cover.png and /dev/null differ diff --git a/doc/tutorials/ml/images/introduction_to_svm.png b/doc/tutorials/ml/images/introduction_to_svm.png deleted file mode 100644 index f2d63751fc..0000000000 Binary files a/doc/tutorials/ml/images/introduction_to_svm.png and /dev/null differ diff --git a/doc/tutorials/ml/images/non_linear_svms.png b/doc/tutorials/ml/images/non_linear_svms.png deleted file mode 100644 index bd185d4c74..0000000000 Binary files a/doc/tutorials/ml/images/non_linear_svms.png and /dev/null differ diff --git a/doc/tutorials/ml/table_of_content_ml.markdown b/doc/tutorials/ml/table_of_content_ml.markdown deleted file mode 100644 index b4064777a2..0000000000 --- a/doc/tutorials/ml/table_of_content_ml.markdown +++ /dev/null @@ -1,36 +0,0 @@ -Machine Learning (ml module) {#tutorial_table_of_content_ml} -============================ - -Use the powerful machine learning classes for statistical classification, regression and clustering -of data. - -- @subpage tutorial_introduction_to_svm - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Fernando Iglesias García - - Learn what a Support Vector Machine is. - -- @subpage tutorial_non_linear_svms - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Fernando Iglesias García - - Here you will learn how to define the optimization problem for SVMs when it is not possible to - separate linearly the training data. - -- @subpage tutorial_introduction_to_pca - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Theodore Tsesmelis - - Learn what a Principal Component Analysis (PCA) is. diff --git a/doc/tutorials/objdetect/images/Cascade_Classifier_Tutorial_Cover.jpg b/doc/tutorials/objdetect/images/Cascade_Classifier_Tutorial_Cover.jpg deleted file mode 100644 index cfa5de67e5..0000000000 Binary files a/doc/tutorials/objdetect/images/Cascade_Classifier_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/objdetect/table_of_content_objdetect.markdown b/doc/tutorials/objdetect/table_of_content_objdetect.markdown deleted file mode 100644 index 0b019d88a5..0000000000 --- a/doc/tutorials/objdetect/table_of_content_objdetect.markdown +++ /dev/null @@ -1,18 +0,0 @@ -Object Detection (objdetect module) {#tutorial_table_of_content_objdetect} -=================================== - -Ever wondered how your digital camera detects peoples and faces? 
Look here to find out! - -- @subpage tutorial_cascade_classifier - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Ana Huamán - - Here we learn how to use *objdetect* to find objects in our images or videos - -- @subpage tutorial_traincascade - - This tutorial describes _opencv_traincascade_ application and its parameters. diff --git a/doc/tutorials/others/_old/table_of_content_ml.markdown b/doc/tutorials/others/_old/table_of_content_ml.markdown new file mode 100644 index 0000000000..5999b0208a --- /dev/null +++ b/doc/tutorials/others/_old/table_of_content_ml.markdown @@ -0,0 +1,4 @@ +Machine Learning (ml module) {#tutorial_table_of_content_ml} +============================ + +Content has been moved to this page: @ref tutorial_table_of_content_other diff --git a/doc/tutorials/others/_old/table_of_content_objdetect.markdown b/doc/tutorials/others/_old/table_of_content_objdetect.markdown new file mode 100644 index 0000000000..0aa69fcd8d --- /dev/null +++ b/doc/tutorials/others/_old/table_of_content_objdetect.markdown @@ -0,0 +1,4 @@ +Object Detection (objdetect module) {#tutorial_table_of_content_objdetect} +=================================== + +Content has been moved to this page: @ref tutorial_table_of_content_other diff --git a/doc/tutorials/others/_old/table_of_content_photo.markdown b/doc/tutorials/others/_old/table_of_content_photo.markdown new file mode 100644 index 0000000000..14a10a9c70 --- /dev/null +++ b/doc/tutorials/others/_old/table_of_content_photo.markdown @@ -0,0 +1,4 @@ +Computational photography (photo module) {#tutorial_table_of_content_photo} +======================================== + +Content has been moved to this page: @ref tutorial_table_of_content_other diff --git a/doc/tutorials/others/_old/table_of_content_stitching.markdown b/doc/tutorials/others/_old/table_of_content_stitching.markdown new file mode 100644 index 0000000000..e8f91ba659 --- /dev/null +++ b/doc/tutorials/others/_old/table_of_content_stitching.markdown @@ -0,0 +1,4 @@ +Images stitching (stitching module) {#tutorial_table_of_content_stitching} +=================================== + +Content has been moved to this page: @ref tutorial_table_of_content_other diff --git a/doc/tutorials/others/_old/table_of_content_video.markdown b/doc/tutorials/others/_old/table_of_content_video.markdown new file mode 100644 index 0000000000..fae3e6ca79 --- /dev/null +++ b/doc/tutorials/others/_old/table_of_content_video.markdown @@ -0,0 +1,4 @@ +Video analysis (video module) {#tutorial_table_of_content_video} +============================= + +Content has been moved to this page: @ref tutorial_table_of_content_other diff --git a/doc/tutorials/video/background_subtraction/background_subtraction.markdown b/doc/tutorials/others/background_subtraction.markdown similarity index 97% rename from doc/tutorials/video/background_subtraction/background_subtraction.markdown rename to doc/tutorials/others/background_subtraction.markdown index 420286960d..5d07f1dae9 100644 --- a/doc/tutorials/video/background_subtraction/background_subtraction.markdown +++ b/doc/tutorials/others/background_subtraction.markdown @@ -1,8 +1,16 @@ How to Use Background Subtraction Methods {#tutorial_background_subtraction} ========================================= +@tableofcontents + +@prev_tutorial{tutorial_stitcher} @next_tutorial{tutorial_meanshift} +| | | +| -: | :- | +| Original author | Domenico Daniele Bloisi | +| Compatibility | OpenCV >= 3.0 | + - Background subtraction (BS) is a common and widely used 
technique for generating a foreground mask (namely, a binary image containing the pixels belonging to moving objects in the scene) by using static cameras. diff --git a/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown b/doc/tutorials/others/cascade_classifier.markdown similarity index 98% rename from doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown rename to doc/tutorials/others/cascade_classifier.markdown index be942bdbdd..149bac5ab5 100644 --- a/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown +++ b/doc/tutorials/others/cascade_classifier.markdown @@ -1,8 +1,16 @@ Cascade Classifier {#tutorial_cascade_classifier} ================== +@tableofcontents + +@prev_tutorial{tutorial_optical_flow} @next_tutorial{tutorial_traincascade} +| | | +| -: | :- | +| Original author | Ana Huamán | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/photo/hdr_imaging/hdr_imaging.markdown b/doc/tutorials/others/hdr_imaging.markdown similarity index 98% rename from doc/tutorials/photo/hdr_imaging/hdr_imaging.markdown rename to doc/tutorials/others/hdr_imaging.markdown index 0bc15fd9b2..a9f1276618 100644 --- a/doc/tutorials/photo/hdr_imaging/hdr_imaging.markdown +++ b/doc/tutorials/others/hdr_imaging.markdown @@ -1,6 +1,15 @@ High Dynamic Range Imaging {#tutorial_hdr_imaging} ========================== +@tableofcontents + +@next_tutorial{tutorial_stitcher} + +| | | +| -: | :- | +| Original author | Fedor Morozov | +| Compatibility | OpenCV >= 3.0 | + Introduction ------------ diff --git a/doc/tutorials/video/background_subtraction/images/Background_Subtraction_Tutorial_Scheme.png b/doc/tutorials/others/images/Background_Subtraction_Tutorial_Scheme.png similarity index 100% rename from doc/tutorials/video/background_subtraction/images/Background_Subtraction_Tutorial_Scheme.png rename to doc/tutorials/others/images/Background_Subtraction_Tutorial_Scheme.png diff --git a/doc/tutorials/video/background_subtraction/images/Background_Subtraction_Tutorial_frame.jpg b/doc/tutorials/others/images/Background_Subtraction_Tutorial_frame.jpg similarity index 100% rename from doc/tutorials/video/background_subtraction/images/Background_Subtraction_Tutorial_frame.jpg rename to doc/tutorials/others/images/Background_Subtraction_Tutorial_frame.jpg diff --git a/doc/tutorials/video/background_subtraction/images/Background_Subtraction_Tutorial_result_KNN.jpg b/doc/tutorials/others/images/Background_Subtraction_Tutorial_result_KNN.jpg similarity index 100% rename from doc/tutorials/video/background_subtraction/images/Background_Subtraction_Tutorial_result_KNN.jpg rename to doc/tutorials/others/images/Background_Subtraction_Tutorial_result_KNN.jpg diff --git a/doc/tutorials/video/background_subtraction/images/Background_Subtraction_Tutorial_result_MOG2.jpg b/doc/tutorials/others/images/Background_Subtraction_Tutorial_result_MOG2.jpg similarity index 100% rename from doc/tutorials/video/background_subtraction/images/Background_Subtraction_Tutorial_result_MOG2.jpg rename to doc/tutorials/others/images/Background_Subtraction_Tutorial_result_MOG2.jpg diff --git a/doc/tutorials/objdetect/cascade_classifier/images/Cascade_Classifier_Tutorial_Result_Haar.jpg b/doc/tutorials/others/images/Cascade_Classifier_Tutorial_Result_Haar.jpg similarity index 100% rename from doc/tutorials/objdetect/cascade_classifier/images/Cascade_Classifier_Tutorial_Result_Haar.jpg rename to doc/tutorials/others/images/Cascade_Classifier_Tutorial_Result_Haar.jpg diff --git 
a/doc/tutorials/objdetect/cascade_classifier/images/Cascade_Classifier_Tutorial_Result_LBP.jpg b/doc/tutorials/others/images/Cascade_Classifier_Tutorial_Result_LBP.jpg similarity index 100% rename from doc/tutorials/objdetect/cascade_classifier/images/Cascade_Classifier_Tutorial_Result_LBP.jpg rename to doc/tutorials/others/images/Cascade_Classifier_Tutorial_Result_LBP.jpg diff --git a/doc/tutorials/stitching/stitcher/images/affinepano.jpg b/doc/tutorials/others/images/affinepano.jpg similarity index 100% rename from doc/tutorials/stitching/stitcher/images/affinepano.jpg rename to doc/tutorials/others/images/affinepano.jpg diff --git a/doc/tutorials/stitching/stitcher/images/boat.jpg b/doc/tutorials/others/images/boat.jpg similarity index 100% rename from doc/tutorials/stitching/stitcher/images/boat.jpg rename to doc/tutorials/others/images/boat.jpg diff --git a/doc/tutorials/stitching/stitcher/images/budapest.jpg b/doc/tutorials/others/images/budapest.jpg similarity index 100% rename from doc/tutorials/stitching/stitcher/images/budapest.jpg rename to doc/tutorials/others/images/budapest.jpg diff --git a/doc/tutorials/stitching/stitcher/images/compressedPlaneA2B1.jpg b/doc/tutorials/others/images/compressedPlaneA2B1.jpg similarity index 100% rename from doc/tutorials/stitching/stitcher/images/compressedPlaneA2B1.jpg rename to doc/tutorials/others/images/compressedPlaneA2B1.jpg diff --git a/doc/tutorials/stitching/stitcher/images/fisheye.jpg b/doc/tutorials/others/images/fisheye.jpg similarity index 100% rename from doc/tutorials/stitching/stitcher/images/fisheye.jpg rename to doc/tutorials/others/images/fisheye.jpg diff --git a/doc/tutorials/photo/hdr_imaging/images/fusion.png b/doc/tutorials/others/images/fusion.png similarity index 100% rename from doc/tutorials/photo/hdr_imaging/images/fusion.png rename to doc/tutorials/others/images/fusion.png diff --git a/doc/tutorials/stitching/stitcher/images/gvedit.jpg b/doc/tutorials/others/images/gvedit.jpg similarity index 100% rename from doc/tutorials/stitching/stitcher/images/gvedit.jpg rename to doc/tutorials/others/images/gvedit.jpg diff --git a/doc/tutorials/objdetect/cascade_classifier/images/haar.png b/doc/tutorials/others/images/haar.png similarity index 100% rename from doc/tutorials/objdetect/cascade_classifier/images/haar.png rename to doc/tutorials/others/images/haar.png diff --git a/doc/tutorials/objdetect/cascade_classifier/images/haar_features.jpg b/doc/tutorials/others/images/haar_features.jpg similarity index 100% rename from doc/tutorials/objdetect/cascade_classifier/images/haar_features.jpg rename to doc/tutorials/others/images/haar_features.jpg diff --git a/doc/tutorials/photo/hdr_imaging/images/ldr.png b/doc/tutorials/others/images/ldr.png similarity index 100% rename from doc/tutorials/photo/hdr_imaging/images/ldr.png rename to doc/tutorials/others/images/ldr.png diff --git a/doc/tutorials/photo/hdr_imaging/images/memorial.png b/doc/tutorials/others/images/memorial.png similarity index 100% rename from doc/tutorials/photo/hdr_imaging/images/memorial.png rename to doc/tutorials/others/images/memorial.png diff --git a/doc/tutorials/stitching/stitcher/images/newspaper.jpg b/doc/tutorials/others/images/newspaper.jpg similarity index 100% rename from doc/tutorials/stitching/stitcher/images/newspaper.jpg rename to doc/tutorials/others/images/newspaper.jpg diff --git a/doc/tutorials/ml/introduction_to_svm/images/optimal-hyperplane.png b/doc/tutorials/others/images/optimal-hyperplane.png similarity index 100% rename from 
doc/tutorials/ml/introduction_to_svm/images/optimal-hyperplane.png rename to doc/tutorials/others/images/optimal-hyperplane.png diff --git a/doc/tutorials/ml/introduction_to_pca/images/pca_eigen.png b/doc/tutorials/others/images/pca_eigen.png similarity index 100% rename from doc/tutorials/ml/introduction_to_pca/images/pca_eigen.png rename to doc/tutorials/others/images/pca_eigen.png diff --git a/doc/tutorials/ml/introduction_to_pca/images/pca_line.png b/doc/tutorials/others/images/pca_line.png similarity index 100% rename from doc/tutorials/ml/introduction_to_pca/images/pca_line.png rename to doc/tutorials/others/images/pca_line.png diff --git a/doc/tutorials/ml/introduction_to_pca/images/pca_output.png b/doc/tutorials/others/images/pca_output.png similarity index 100% rename from doc/tutorials/ml/introduction_to_pca/images/pca_output.png rename to doc/tutorials/others/images/pca_output.png diff --git a/doc/tutorials/ml/introduction_to_pca/images/pca_test1.jpg b/doc/tutorials/others/images/pca_test1.jpg similarity index 100% rename from doc/tutorials/ml/introduction_to_pca/images/pca_test1.jpg rename to doc/tutorials/others/images/pca_test1.jpg diff --git a/doc/tutorials/ml/non_linear_svms/images/sample-errors-dist.png b/doc/tutorials/others/images/sample-errors-dist.png similarity index 100% rename from doc/tutorials/ml/non_linear_svms/images/sample-errors-dist.png rename to doc/tutorials/others/images/sample-errors-dist.png diff --git a/doc/tutorials/ml/introduction_to_svm/images/separating-lines.png b/doc/tutorials/others/images/separating-lines.png similarity index 100% rename from doc/tutorials/ml/introduction_to_svm/images/separating-lines.png rename to doc/tutorials/others/images/separating-lines.png diff --git a/doc/tutorials/ml/introduction_to_svm/images/svm_intro_result.png b/doc/tutorials/others/images/svm_intro_result.png similarity index 100% rename from doc/tutorials/ml/introduction_to_svm/images/svm_intro_result.png rename to doc/tutorials/others/images/svm_intro_result.png diff --git a/doc/tutorials/ml/non_linear_svms/images/svm_non_linear_result.png b/doc/tutorials/others/images/svm_non_linear_result.png similarity index 100% rename from doc/tutorials/ml/non_linear_svms/images/svm_non_linear_result.png rename to doc/tutorials/others/images/svm_non_linear_result.png diff --git a/doc/tutorials/objdetect/images/visualisation_single_stage.png b/doc/tutorials/others/images/visualisation_single_stage.png similarity index 100% rename from doc/tutorials/objdetect/images/visualisation_single_stage.png rename to doc/tutorials/others/images/visualisation_single_stage.png diff --git a/doc/tutorials/objdetect/images/visualisation_video.png b/doc/tutorials/others/images/visualisation_video.png similarity index 100% rename from doc/tutorials/objdetect/images/visualisation_video.png rename to doc/tutorials/others/images/visualisation_video.png diff --git a/doc/tutorials/ml/introduction_to_pca/introduction_to_pca.markdown b/doc/tutorials/others/introduction_to_pca.markdown similarity index 98% rename from doc/tutorials/ml/introduction_to_pca/introduction_to_pca.markdown rename to doc/tutorials/others/introduction_to_pca.markdown index c1c6c53a99..490024ac29 100644 --- a/doc/tutorials/ml/introduction_to_pca/introduction_to_pca.markdown +++ b/doc/tutorials/others/introduction_to_pca.markdown @@ -1,8 +1,15 @@ Introduction to Principal Component Analysis (PCA) {#tutorial_introduction_to_pca} ======================================= +@tableofcontents + @prev_tutorial{tutorial_non_linear_svms} 
+| | | +| -: | :- | +| Original author | Theodore Tsesmelis | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown b/doc/tutorials/others/introduction_to_svm.markdown similarity index 98% rename from doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown rename to doc/tutorials/others/introduction_to_svm.markdown index 1340061228..557093c2ae 100644 --- a/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown +++ b/doc/tutorials/others/introduction_to_svm.markdown @@ -1,8 +1,16 @@ Introduction to Support Vector Machines {#tutorial_introduction_to_svm} ======================================= +@tableofcontents + +@prev_tutorial{tutorial_traincascade} @next_tutorial{tutorial_non_linear_svms} +| | | +| -: | :- | +| Original author | Fernando Iglesias García | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/video/meanshift/meanshift.markdown b/doc/tutorials/others/meanshift.markdown similarity index 99% rename from doc/tutorials/video/meanshift/meanshift.markdown rename to doc/tutorials/others/meanshift.markdown index 6472570ead..7ca9ecad30 100644 --- a/doc/tutorials/video/meanshift/meanshift.markdown +++ b/doc/tutorials/others/meanshift.markdown @@ -1,6 +1,8 @@ Meanshift and Camshift {#tutorial_meanshift} ====================== +@tableofcontents + @prev_tutorial{tutorial_background_subtraction} @next_tutorial{tutorial_optical_flow} diff --git a/doc/tutorials/ml/non_linear_svms/non_linear_svms.markdown b/doc/tutorials/others/non_linear_svms.markdown similarity index 99% rename from doc/tutorials/ml/non_linear_svms/non_linear_svms.markdown rename to doc/tutorials/others/non_linear_svms.markdown index 025ae0cda1..f70449fe1d 100644 --- a/doc/tutorials/ml/non_linear_svms/non_linear_svms.markdown +++ b/doc/tutorials/others/non_linear_svms.markdown @@ -1,9 +1,16 @@ Support Vector Machines for Non-Linearly Separable Data {#tutorial_non_linear_svms} ======================================================= +@tableofcontents + @prev_tutorial{tutorial_introduction_to_svm} @next_tutorial{tutorial_introduction_to_pca} +| | | +| -: | :- | +| Original author | Fernando Iglesias García | +| Compatibility | OpenCV >= 3.0 | + Goal ---- diff --git a/doc/tutorials/video/optical_flow/optical_flow.markdown b/doc/tutorials/others/optical_flow.markdown similarity index 99% rename from doc/tutorials/video/optical_flow/optical_flow.markdown rename to doc/tutorials/others/optical_flow.markdown index bcf88f7af1..07456d7ea9 100644 --- a/doc/tutorials/video/optical_flow/optical_flow.markdown +++ b/doc/tutorials/others/optical_flow.markdown @@ -1,7 +1,10 @@ Optical Flow {#tutorial_optical_flow} ============ +@tableofcontents + @prev_tutorial{tutorial_meanshift} +@next_tutorial{tutorial_cascade_classifier} Goal ---- diff --git a/doc/tutorials/stitching/stitcher/stitcher.markdown b/doc/tutorials/others/stitcher.markdown similarity index 97% rename from doc/tutorials/stitching/stitcher/stitcher.markdown rename to doc/tutorials/others/stitcher.markdown index 3670065bbe..e636d83f30 100644 --- a/doc/tutorials/stitching/stitcher/stitcher.markdown +++ b/doc/tutorials/others/stitcher.markdown @@ -1,6 +1,16 @@ High level stitching API (Stitcher class) {#tutorial_stitcher} ========================================= +@tableofcontents + +@prev_tutorial{tutorial_hdr_imaging} +@next_tutorial{tutorial_background_subtraction} + +| | | +| -: | :- | +| Original author | Jiri Horner | +| Compatibility | OpenCV >= 3.2 | + Goal 
---- diff --git a/doc/tutorials/others/table_of_content_other.markdown b/doc/tutorials/others/table_of_content_other.markdown new file mode 100644 index 0000000000..a004df63e2 --- /dev/null +++ b/doc/tutorials/others/table_of_content_other.markdown @@ -0,0 +1,13 @@ +Other tutorials (ml, objdetect, photo, stitching, video) {#tutorial_table_of_content_other} +======================================================== + +- photo. @subpage tutorial_hdr_imaging +- stitching. @subpage tutorial_stitcher +- video. @subpage tutorial_background_subtraction +- video. @subpage tutorial_meanshift +- video. @subpage tutorial_optical_flow +- objdetect. @subpage tutorial_cascade_classifier +- objdetect. @subpage tutorial_traincascade +- ml. @subpage tutorial_introduction_to_svm +- ml. @subpage tutorial_non_linear_svms +- ml. @subpage tutorial_introduction_to_pca diff --git a/doc/tutorials/objdetect/traincascade.markdown b/doc/tutorials/others/traincascade.markdown similarity index 99% rename from doc/tutorials/objdetect/traincascade.markdown rename to doc/tutorials/others/traincascade.markdown index 042aaccdc9..0d95bd003a 100644 --- a/doc/tutorials/objdetect/traincascade.markdown +++ b/doc/tutorials/others/traincascade.markdown @@ -1,7 +1,10 @@ Cascade Classifier Training {#tutorial_traincascade} =========================== +@tableofcontents + @prev_tutorial{tutorial_cascade_classifier} +@next_tutorial{tutorial_introduction_to_svm} Introduction ------------ diff --git a/doc/tutorials/photo/images/hdr.png b/doc/tutorials/photo/images/hdr.png deleted file mode 100644 index 9d3782055c..0000000000 Binary files a/doc/tutorials/photo/images/hdr.png and /dev/null differ diff --git a/doc/tutorials/photo/table_of_content_photo.markdown b/doc/tutorials/photo/table_of_content_photo.markdown deleted file mode 100644 index 357c36996e..0000000000 --- a/doc/tutorials/photo/table_of_content_photo.markdown +++ /dev/null @@ -1,14 +0,0 @@ -Computational photography (photo module) {#tutorial_table_of_content_photo} -======================================== - -Use OpenCV for advanced photo processing. - -- @subpage tutorial_hdr_imaging - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 3.0 - - *Author:* Fedor Morozov - - Learn how to create and process high dynamic range images. diff --git a/doc/tutorials/stitching/table_of_content_stitching.markdown b/doc/tutorials/stitching/table_of_content_stitching.markdown deleted file mode 100644 index d5972f4343..0000000000 --- a/doc/tutorials/stitching/table_of_content_stitching.markdown +++ /dev/null @@ -1,17 +0,0 @@ -Images stitching (stitching module) {#tutorial_table_of_content_stitching} -=================================== - -Sometimes a single image can't capture it all. Here you will learn how to join -more images together to create a large pano. Doesn't matter if you want to -create a photo panorama or you want to stitch scans. - -- @subpage tutorial_stitcher - - *Languages:* C++ - - *Compatibility:* \>= OpenCV 3.2 - - *Author:* Jiri Horner - - You will use high level stitching api to create a photo panorama. You will - learn about Stitcher class and its configurations. 
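For reference, the high-level stitching API that the relocated tutorial covers reduces to a handful of calls; a minimal sketch with hypothetical input file names:

@code{.cpp}
// Minimal panorama with the high-level Stitcher API.
#include <opencv2/imgcodecs.hpp>
#include <opencv2/stitching.hpp>
#include <vector>

int main()
{
    // Input file names are hypothetical; any overlapping pair of photos works.
    std::vector<cv::Mat> imgs = { cv::imread("left.jpg"), cv::imread("right.jpg") };

    cv::Ptr<cv::Stitcher> stitcher = cv::Stitcher::create(cv::Stitcher::PANORAMA);
    cv::Mat pano;
    if (stitcher->stitch(imgs, pano) != cv::Stitcher::OK)
        return 1;  // not enough overlap or features, camera estimation failed, etc.
    cv::imwrite("pano.jpg", pano);
    return 0;
}
@endcode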
diff --git a/doc/tutorials/tutorials.markdown b/doc/tutorials/tutorials.markdown index 5a35077df9..59aefc2b1f 100644 --- a/doc/tutorials/tutorials.markdown +++ b/doc/tutorials/tutorials.markdown @@ -4,18 +4,12 @@ OpenCV Tutorials {#tutorial_root} - @subpage tutorial_table_of_content_introduction - build and install OpenCV on your computer - @subpage tutorial_table_of_content_core - basic building blocks of the library - @subpage tutorial_table_of_content_imgproc - image processing functions -- @subpage tutorial_table_of_content_highgui - built-in graphical user interface -- @subpage tutorial_table_of_content_imgcodecs - read and write images from/to files using _imgcodecs_ module -- @subpage tutorial_table_of_content_videoio - read and write videos using _videio_ module +- @subpage tutorial_table_of_content_app - application utils (GUI, image/video input/output) - @subpage tutorial_table_of_content_calib3d - extract 3D world information from 2D images - @subpage tutorial_table_of_content_features2d - feature detectors, descriptors and matching framework -- @subpage tutorial_table_of_content_video - algorithms for video streams: motion detection, object and feature tracking, etc. -- @subpage tutorial_table_of_content_objdetect - detect objects using conventional CV methods - @subpage tutorial_table_of_content_dnn - infer neural networks using built-in _dnn_ module -- @subpage tutorial_table_of_content_ml - machine learning algorithms for statistical classification, regression and data clustering - @subpage tutorial_table_of_content_gapi - graph-based approach to computer vision algorithms building -- @subpage tutorial_table_of_content_photo - advanced photo processing -- @subpage tutorial_table_of_content_stitching - create panoramas and more using _stitching_ module +- @subpage tutorial_table_of_content_other - other modules (ml, objdetect, stitching, video, photo) - @subpage tutorial_table_of_content_ios - running OpenCV on an iDevice @cond CUDA_MODULES - @subpage tutorial_table_of_content_gpu - utilizing power of video card to run CV algorithms diff --git a/doc/tutorials/video/images/Background_Subtraction_Tutorial_Cover.jpg b/doc/tutorials/video/images/Background_Subtraction_Tutorial_Cover.jpg deleted file mode 100644 index d5c84a3722..0000000000 Binary files a/doc/tutorials/video/images/Background_Subtraction_Tutorial_Cover.jpg and /dev/null differ diff --git a/doc/tutorials/video/table_of_content_video.markdown b/doc/tutorials/video/table_of_content_video.markdown deleted file mode 100644 index 1a80f716da..0000000000 --- a/doc/tutorials/video/table_of_content_video.markdown +++ /dev/null @@ -1,28 +0,0 @@ -Video analysis (video module) {#tutorial_table_of_content_video} -============================= - -Look here in order to find use on your video stream algorithms like: motion extraction, feature -tracking and foreground extractions. - -- @subpage tutorial_background_subtraction - - *Languages:* C++, Java, Python - - *Compatibility:* \> OpenCV 2.4.6 - - *Author:* Domenico Daniele Bloisi - - We will learn how to extract foreground masks from both videos and sequences of images and - to show them. - -- @subpage tutorial_meanshift - - *Languages:* C++, Java, Python - - Learn how to use the Meanshift and Camshift algorithms to track objects in videos. - -- @subpage tutorial_optical_flow - - *Languages:* C++, Java, Python - - We will learn how to use optical flow methods to track sparse features or to create a dense representation. 
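The background-subtraction tutorial folded into the new "other" section keeps the same core loop; a minimal sketch, assuming a hypothetical input video `vtest.avi`:

@code{.cpp}
// Foreground masks with the MOG2 background subtractor.
#include <opencv2/video.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/highgui.hpp>

int main()
{
    // "vtest.avi" is a hypothetical input video.
    cv::VideoCapture cap("vtest.avi");
    cv::Ptr<cv::BackgroundSubtractor> pBackSub = cv::createBackgroundSubtractorMOG2();

    cv::Mat frame, fgMask;
    while (cap.read(frame))
    {
        pBackSub->apply(frame, fgMask);    // update the model, get the foreground mask
        cv::imshow("FG Mask", fgMask);
        if (cv::waitKey(30) == 27) break;  // Esc quits
    }
    return 0;
}
@endcode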
diff --git a/doc/tutorials/videoio/images/video-input-psnr-ssim.png b/doc/tutorials/videoio/images/video-input-psnr-ssim.png deleted file mode 100644 index de8c2835a6..0000000000 Binary files a/doc/tutorials/videoio/images/video-input-psnr-ssim.png and /dev/null differ diff --git a/doc/tutorials/videoio/images/video-write.png b/doc/tutorials/videoio/images/video-write.png deleted file mode 100644 index 9413d80836..0000000000 Binary files a/doc/tutorials/videoio/images/video-write.png and /dev/null differ diff --git a/doc/tutorials/videoio/table_of_content_videoio.markdown b/doc/tutorials/videoio/table_of_content_videoio.markdown deleted file mode 100644 index 393a0fc236..0000000000 --- a/doc/tutorials/videoio/table_of_content_videoio.markdown +++ /dev/null @@ -1,35 +0,0 @@ -Video Input and Output (videoio module) {#tutorial_table_of_content_videoio} -========================================= - -This section contains tutorials about how to read/save your video files. - -- @subpage tutorial_video_input_psnr_ssim - - *Languages:* C++, Python - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Bernát Gábor - - You will learn how to read video streams, and how to calculate similarity values such as PSNR - or SSIM. - -- @subpage tutorial_video_write - - *Languages:* C++ - - *Compatibility:* \> OpenCV 2.0 - - *Author:* Bernát Gábor - -- @subpage tutorial_kinect_openni - - *Languages:* C++ - -- @subpage tutorial_orbbec_astra - - *Languages:* C++ - -- @subpage tutorial_intelperc - - *Languages:* C++ diff --git a/modules/3d/include/opencv2/3d.hpp b/modules/3d/include/opencv2/3d.hpp index 7591a354fc..6984b705a2 100644 --- a/modules/3d/include/opencv2/3d.hpp +++ b/modules/3d/include/opencv2/3d.hpp @@ -51,7 +51,7 @@ respectively) by the same factor. The joint rotation-translation matrix \f$[R|t]\f$ is the matrix product of a projective transformation and a homogeneous transformation. The 3-by-4 projective transformation maps 3D points -represented in camera coordinates to 2D poins in the image plane and represented in normalized +represented in camera coordinates to 2D points in the image plane and represented in normalized camera coordinates \f$x' = X_c / Z_c\f$ and \f$y' = Y_c / Z_c\f$: \f[Z_c \begin{bmatrix} @@ -538,15 +538,15 @@ or vector\<Point2f\> . a vector\<Point2f\> . @param method Method used to compute a homography matrix. The following methods are possible: - **0** - a regular method using all the points, i.e., the least squares method -- **RANSAC** - RANSAC-based robust method -- **LMEDS** - Least-Median robust method -- **RHO** - PROSAC-based robust method +- @ref RANSAC - RANSAC-based robust method +- @ref LMEDS - Least-Median robust method +- @ref RHO - PROSAC-based robust method @param ransacReprojThreshold Maximum allowed reprojection error to treat a point pair as an inlier (used in the RANSAC and RHO methods only). That is, if \f[\| \texttt{dstPoints} _i - \texttt{convertPointsHomogeneous} ( \texttt{H} * \texttt{srcPoints} _i) \|_2 > \texttt{ransacReprojThreshold}\f] then the point \f$i\f$ is considered as an outlier. If srcPoints and dstPoints are measured in pixels, it usually makes sense to set this parameter somewhere in the range of 1 to 10. -@param mask Optional output mask set by a robust method ( RANSAC or LMEDS ). Note that the input +@param mask Optional output mask set by a robust method ( RANSAC or LMeDS ). Note that the input mask values are ignored. @param maxIters The maximum number of RANSAC iterations. @param confidence Confidence level, between 0 and 1.
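The robust-method flags and inlier mask documented in this hunk are typically used as follows; a minimal sketch with made-up correspondences (the header name follows OpenCV 4.x; this branch declares the function in `opencv2/3d.hpp`):

@code{.cpp}
// Robust homography with RANSAC plus the inlier mask described above.
#include <opencv2/calib3d.hpp>  // opencv2/3d.hpp in this branch
#include <vector>

int main()
{
    // Made-up correspondences: dst is src scaled by 10 and shifted by (10,10).
    std::vector<cv::Point2f> src = { {0,0}, {1,0}, {1,1}, {0,1}, {0.5f,0.5f} };
    std::vector<cv::Point2f> dst = { {10,10}, {20,10}, {20,20}, {10,20}, {15,15} };

    std::vector<uchar> inlierMask;  // 1 = inlier, 0 = outlier, as documented above
    cv::Mat H = cv::findHomography(src, dst, cv::RANSAC,
                                   3.0 /* ransacReprojThreshold, pixels */, inlierMask);
    return H.empty() ? 1 : 0;
}
@endcode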
@@ -781,37 +781,37 @@ the model coordinate system to the camera coordinate system. the provided rvec and tvec values as initial approximations of the rotation and translation vectors, respectively, and further optimizes them. @param flags Method for solving a PnP problem: -- **SOLVEPNP_ITERATIVE** Iterative method is based on a Levenberg-Marquardt optimization. In +- @ref SOLVEPNP_ITERATIVE Iterative method is based on a Levenberg-Marquardt optimization. In this case the function finds such a pose that minimizes reprojection error, that is the sum of squared distances between the observed projections imagePoints and the projected (using @ref projectPoints ) objectPoints . -- **SOLVEPNP_P3P** Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang +- @ref SOLVEPNP_P3P Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang "Complete Solution Classification for the Perspective-Three-Point Problem" (@cite gao2003complete). In this case the function requires exactly four object and image points. -- **SOLVEPNP_AP3P** Method is based on the paper of T. Ke, S. Roumeliotis +- @ref SOLVEPNP_AP3P Method is based on the paper of T. Ke, S. Roumeliotis "An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17). In this case the function requires exactly four object and image points. -- **SOLVEPNP_EPNP** Method has been introduced by F. Moreno-Noguer, V. Lepetit and P. Fua in the +- @ref SOLVEPNP_EPNP Method has been introduced by F. Moreno-Noguer, V. Lepetit and P. Fua in the paper "EPnP: Efficient Perspective-n-Point Camera Pose Estimation" (@cite lepetit2009epnp). -- **SOLVEPNP_DLS** **Broken implementation. Using this flag will fallback to EPnP.** \n +- @ref SOLVEPNP_DLS **Broken implementation. Using this flag will fallback to EPnP.** \n Method is based on the paper of J. Hesch and S. Roumeliotis. "A Direct Least-Squares (DLS) Method for PnP" (@cite hesch2011direct). -- **SOLVEPNP_UPNP** **Broken implementation. Using this flag will fallback to EPnP.** \n +- @ref SOLVEPNP_UPNP **Broken implementation. Using this flag will fallback to EPnP.** \n Method is based on the paper of A. Penate-Sanchez, J. Andrade-Cetto, F. Moreno-Noguer. "Exhaustive Linearization for Robust Camera Pose and Focal Length Estimation" (@cite penate2013exhaustive). In this case the function also estimates the parameters \f$f_x\f$ and \f$f_y\f$ assuming that both have the same value. Then the cameraMatrix is updated with the estimated focal length. -- **SOLVEPNP_IPPE** Method is based on the paper of T. Collins and A. Bartoli. +- @ref SOLVEPNP_IPPE Method is based on the paper of T. Collins and A. Bartoli. "Infinitesimal Plane-Based Pose Estimation" (@cite Collins14). This method requires coplanar object points. -- **SOLVEPNP_IPPE_SQUARE** Method is based on the paper of Toby Collins and Adrien Bartoli. +- @ref SOLVEPNP_IPPE_SQUARE Method is based on the paper of Toby Collins and Adrien Bartoli. "Infinitesimal Plane-Based Pose Estimation" (@cite Collins14). This method is suitable for marker pose estimation. 
It requires 4 coplanar object points defined in the following order: - point 0: [-squareLength / 2, squareLength / 2, 0] - point 1: [ squareLength / 2, squareLength / 2, 0] - point 2: [ squareLength / 2, -squareLength / 2, 0] - point 3: [-squareLength / 2, -squareLength / 2, 0] -- **SOLVEPNP_SQPNP** Method is based on the paper "A Consistently Fast and Globally Optimal Solution to the +- @ref SOLVEPNP_SQPNP Method is based on the paper "A Consistently Fast and Globally Optimal Solution to the Perspective-n-Point Problem" by G. Terzakis and M.Lourakis (@cite Terzakis20). It requires 3 or more points. @@ -921,23 +921,23 @@ a 3D point expressed in the world frame into the camera frame: - Thus, given some data D = np.array(...) where D.shape = (N,M), in order to use a subset of it as, e.g., imagePoints, one must effectively copy it into a new array: imagePoints = np.ascontiguousarray(D[:,:2]).reshape((N,1,2)) - - The methods **SOLVEPNP_DLS** and **SOLVEPNP_UPNP** cannot be used as the current implementations are + - The methods @ref SOLVEPNP_DLS and @ref SOLVEPNP_UPNP cannot be used as the current implementations are unstable and sometimes give completely wrong results. If you pass one of these two - flags, **SOLVEPNP_EPNP** method will be used instead. - - The minimum number of points is 4 in the general case. In the case of **SOLVEPNP_P3P** and **SOLVEPNP_AP3P** + flags, @ref SOLVEPNP_EPNP method will be used instead. + - The minimum number of points is 4 in the general case. In the case of @ref SOLVEPNP_P3P and @ref SOLVEPNP_AP3P methods, it is required to use exactly 4 points (the first 3 points are used to estimate all the solutions of the P3P problem, the last one is used to retain the best solution that minimizes the reprojection error). - - With **SOLVEPNP_ITERATIVE** method and `useExtrinsicGuess=true`, the minimum number of points is 3 (3 points + - With @ref SOLVEPNP_ITERATIVE method and `useExtrinsicGuess=true`, the minimum number of points is 3 (3 points are sufficient to compute a pose but there are up to 4 solutions). The initial solution should be close to the global solution to converge. - - With **SOLVEPNP_IPPE** input points must be >= 4 and object points must be coplanar. - - With **SOLVEPNP_IPPE_SQUARE** this is a special case suitable for marker pose estimation. + - With @ref SOLVEPNP_IPPE input points must be >= 4 and object points must be coplanar. + - With @ref SOLVEPNP_IPPE_SQUARE this is a special case suitable for marker pose estimation. Number of input points must be 4. Object points must be defined in the following order: - point 0: [-squareLength / 2, squareLength / 2, 0] - point 1: [ squareLength / 2, squareLength / 2, 0] - point 2: [ squareLength / 2, -squareLength / 2, 0] - point 3: [-squareLength / 2, -squareLength / 2, 0] - - With **SOLVEPNP_SQPNP** input points must be >= 3 + - With @ref SOLVEPNP_SQPNP input points must be >= 3 */ CV_EXPORTS_W bool solvePnP( InputArray objectPoints, InputArray imagePoints, InputArray cameraMatrix, InputArray distCoeffs, @@ -1015,9 +1015,9 @@ assumed. the model coordinate system to the camera coordinate system. A P3P problem has up to 4 solutions. @param tvecs Output translation vectors. @param flags Method for solving a P3P problem: -- **SOLVEPNP_P3P** Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang +- @ref SOLVEPNP_P3P Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang "Complete Solution Classification for the Perspective-Three-Point Problem" (@cite gao2003complete). 
-- **SOLVEPNP_AP3P** Method is based on the paper of T. Ke and S. Roumeliotis. +- @ref SOLVEPNP_AP3P Method is based on the paper of T. Ke and S. Roumeliotis. "An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17). The function estimates the object pose given 3 object points, their corresponding image @@ -1119,39 +1119,39 @@ the model coordinate system to the camera coordinate system. the provided rvec and tvec values as initial approximations of the rotation and translation vectors, respectively, and further optimizes them. @param flags Method for solving a PnP problem: -- **SOLVEPNP_ITERATIVE** Iterative method is based on a Levenberg-Marquardt optimization. In +- @ref SOLVEPNP_ITERATIVE Iterative method is based on a Levenberg-Marquardt optimization. In this case the function finds such a pose that minimizes reprojection error, that is the sum of squared distances between the observed projections imagePoints and the projected (using projectPoints ) objectPoints . -- **SOLVEPNP_P3P** Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang +- @ref SOLVEPNP_P3P Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang "Complete Solution Classification for the Perspective-Three-Point Problem" (@cite gao2003complete). In this case the function requires exactly four object and image points. -- **SOLVEPNP_AP3P** Method is based on the paper of T. Ke, S. Roumeliotis +- @ref SOLVEPNP_AP3P Method is based on the paper of T. Ke, S. Roumeliotis "An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17). In this case the function requires exactly four object and image points. -- **SOLVEPNP_EPNP** Method has been introduced by F.Moreno-Noguer, V.Lepetit and P.Fua in the +- @ref SOLVEPNP_EPNP Method has been introduced by F.Moreno-Noguer, V.Lepetit and P.Fua in the paper "EPnP: Efficient Perspective-n-Point Camera Pose Estimation" (@cite lepetit2009epnp). -- **SOLVEPNP_DLS** **Broken implementation. Using this flag will fallback to EPnP.** \n +- @ref SOLVEPNP_DLS **Broken implementation. Using this flag will fallback to EPnP.** \n Method is based on the paper of Joel A. Hesch and Stergios I. Roumeliotis. "A Direct Least-Squares (DLS) Method for PnP" (@cite hesch2011direct). -- **SOLVEPNP_UPNP** **Broken implementation. Using this flag will fallback to EPnP.** \n +- @ref SOLVEPNP_UPNP **Broken implementation. Using this flag will fallback to EPnP.** \n Method is based on the paper of A.Penate-Sanchez, J.Andrade-Cetto, F.Moreno-Noguer. "Exhaustive Linearization for Robust Camera Pose and Focal Length Estimation" (@cite penate2013exhaustive). In this case the function also estimates the parameters \f$f_x\f$ and \f$f_y\f$ assuming that both have the same value. Then the cameraMatrix is updated with the estimated focal length. -- **SOLVEPNP_IPPE** Method is based on the paper of T. Collins and A. Bartoli. +- @ref SOLVEPNP_IPPE Method is based on the paper of T. Collins and A. Bartoli. "Infinitesimal Plane-Based Pose Estimation" (@cite Collins14). This method requires coplanar object points. -- **SOLVEPNP_IPPE_SQUARE** Method is based on the paper of Toby Collins and Adrien Bartoli. +- @ref SOLVEPNP_IPPE_SQUARE Method is based on the paper of Toby Collins and Adrien Bartoli. "Infinitesimal Plane-Based Pose Estimation" (@cite Collins14). This method is suitable for marker pose estimation. 
It requires 4 coplanar object points defined in the following order: - point 0: [-squareLength / 2, squareLength / 2, 0] - point 1: [ squareLength / 2, squareLength / 2, 0] - point 2: [ squareLength / 2, -squareLength / 2, 0] - point 3: [-squareLength / 2, -squareLength / 2, 0] -@param rvec Rotation vector used to initialize an iterative PnP refinement algorithm, when flag is SOLVEPNP_ITERATIVE +@param rvec Rotation vector used to initialize an iterative PnP refinement algorithm, when flag is @ref SOLVEPNP_ITERATIVE and useExtrinsicGuess is set to true. -@param tvec Translation vector used to initialize an iterative PnP refinement algorithm, when flag is SOLVEPNP_ITERATIVE +@param tvec Translation vector used to initialize an iterative PnP refinement algorithm, when flag is @ref SOLVEPNP_ITERATIVE and useExtrinsicGuess is set to true. @param reprojectionError Optional vector of reprojection error, that is the RMS error (\f$ \text{RMSE} = \sqrt{\frac{\sum_{i}^{N} \left ( \hat{y_i} - y_i \right )^2}{N}} \f$) between the input image points @@ -1263,17 +1263,17 @@ a 3D point expressed in the world frame into the camera frame: - Thus, given some data D = np.array(...) where D.shape = (N,M), in order to use a subset of it as, e.g., imagePoints, one must effectively copy it into a new array: imagePoints = np.ascontiguousarray(D[:,:2]).reshape((N,1,2)) - - The methods **SOLVEPNP_DLS** and **SOLVEPNP_UPNP** cannot be used as the current implementations are + - The methods @ref SOLVEPNP_DLS and @ref SOLVEPNP_UPNP cannot be used as the current implementations are unstable and sometimes give completely wrong results. If you pass one of these two - flags, **SOLVEPNP_EPNP** method will be used instead. - - The minimum number of points is 4 in the general case. In the case of **SOLVEPNP_P3P** and **SOLVEPNP_AP3P** + flags, @ref SOLVEPNP_EPNP method will be used instead. + - The minimum number of points is 4 in the general case. In the case of @ref SOLVEPNP_P3P and @ref SOLVEPNP_AP3P methods, it is required to use exactly 4 points (the first 3 points are used to estimate all the solutions of the P3P problem, the last one is used to retain the best solution that minimizes the reprojection error). - - With **SOLVEPNP_ITERATIVE** method and `useExtrinsicGuess=true`, the minimum number of points is 3 (3 points + - With @ref SOLVEPNP_ITERATIVE method and `useExtrinsicGuess=true`, the minimum number of points is 3 (3 points are sufficient to compute a pose but there are up to 4 solutions). The initial solution should be close to the global solution to converge. - - With **SOLVEPNP_IPPE** input points must be >= 4 and object points must be coplanar. - - With **SOLVEPNP_IPPE_SQUARE** this is a special case suitable for marker pose estimation. + - With @ref SOLVEPNP_IPPE input points must be >= 4 and object points must be coplanar. + - With @ref SOLVEPNP_IPPE_SQUARE this is a special case suitable for marker pose estimation. Number of input points must be 4. Object points must be defined in the following order: - point 0: [-squareLength / 2, squareLength / 2, 0] - point 1: [ squareLength / 2, squareLength / 2, 0] @@ -1427,8 +1427,8 @@ same camera intrinsic matrix. If this assumption does not hold for your use case to normalized image coordinates, which are valid for the identity camera intrinsic matrix. When passing these coordinates, pass the identity matrix for this parameter. @param method Method for computing an essential matrix. -- **RANSAC** for the RANSAC algorithm. -- **LMEDS** for the LMedS algorithm. 
+- @ref RANSAC for the RANSAC algorithm. +- @ref LMEDS for the LMedS algorithm. @param prob Parameter used for the RANSAC or LMedS methods only. It specifies a desirable level of confidence (probability) that the estimated matrix is correct. @param threshold Parameter used for RANSAC. It is the maximum distance from a point to an epipolar @@ -1437,6 +1437,7 @@ final fundamental matrix. It can be set to something like 1-3, depending on the point localization, image resolution, and the image noise. @param mask Output array of N elements, every element of which is set to 0 for outliers and to 1 for the other points. The array is computed only in the RANSAC and LMedS methods. +@param maxIters The maximum number of robust method iterations. This function estimates essential matrix based on the five-point algorithm solver in @cite Nister03 . @cite SteweniusCFS is also a related. The epipolar geometry is described by the following equation: @@ -1447,10 +1448,22 @@ where \f$E\f$ is an essential matrix, \f$p_1\f$ and \f$p_2\f$ are corresponding second images, respectively. The result of this function may be passed further to decomposeEssentialMat or recoverPose to recover the relative pose between cameras. */ -CV_EXPORTS_W Mat findEssentialMat( InputArray points1, InputArray points2, - InputArray cameraMatrix, int method = RANSAC, - double prob = 0.999, double threshold = 1.0, - OutputArray mask = noArray() ); +CV_EXPORTS_W +Mat findEssentialMat( + InputArray points1, InputArray points2, + InputArray cameraMatrix, int method = RANSAC, + double prob = 0.999, double threshold = 1.0, + int maxIters = 1000, OutputArray mask = noArray() +); + +/** @overload */ +CV_EXPORTS +Mat findEssentialMat( + InputArray points1, InputArray points2, + InputArray cameraMatrix, int method, + double prob, double threshold, + OutputArray mask +); // TODO remove from OpenCV 5.0 /** @overload @param points1 Array of N (N \>= 5) 2D points from the first image. The point coordinates should @@ -1460,8 +1473,8 @@ be floating-point (single or double precision). are feature points from cameras with same focal length and principal point. @param pp principal point of the camera. @param method Method for computing a fundamental matrix. -- **RANSAC** for the RANSAC algorithm. -- **LMEDS** for the LMedS algorithm. +- @ref RANSAC for the RANSAC algorithm. +- @ref LMEDS for the LMedS algorithm. @param threshold Parameter used for RANSAC. It is the maximum distance from a point to an epipolar line in pixels, beyond which the point is considered an outlier and is not used for computing the final fundamental matrix. It can be set to something like 1-3, depending on the accuracy of the @@ -1470,6 +1483,7 @@ point localization, image resolution, and the image noise. confidence (probability) that the estimated matrix is correct. @param mask Output array of N elements, every element of which is set to 0 for outliers and to 1 for the other points. The array is computed only in the RANSAC and LMedS methods. +@param maxIters The maximum number of robust method iterations. 
This function differs from the one above that it computes camera intrinsic matrix from focal length and principal point: @@ -1481,10 +1495,23 @@ f & 0 & x_{pp} \\ 0 & 0 & 1 \end{bmatrix}\f] */ -CV_EXPORTS_W Mat findEssentialMat( InputArray points1, InputArray points2, - double focal = 1.0, Point2d pp = Point2d(0, 0), - int method = RANSAC, double prob = 0.999, - double threshold = 1.0, OutputArray mask = noArray() ); +CV_EXPORTS_W +Mat findEssentialMat( + InputArray points1, InputArray points2, + double focal = 1.0, Point2d pp = Point2d(0, 0), + int method = RANSAC, double prob = 0.999, + double threshold = 1.0, int maxIters = 1000, + OutputArray mask = noArray() +); + +/** @overload */ +CV_EXPORTS +Mat findEssentialMat( + InputArray points1, InputArray points2, + double focal, Point2d pp, + int method, double prob, + double threshold, OutputArray mask +); // TODO remove from OpenCV 5.0 /** @brief Calculates an essential matrix from the corresponding points in two images from potentially two different cameras. @@ -1510,8 +1537,8 @@ of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed. @param method Method for computing an essential matrix. -- **RANSAC** for the RANSAC algorithm. -- **LMEDS** for the LMedS algorithm. +- @ref RANSAC for the RANSAC algorithm. +- @ref LMEDS for the LMedS algorithm. @param prob Parameter used for the RANSAC or LMedS methods only. It specifies a desirable level of confidence (probability) that the estimated matrix is correct. @param threshold Parameter used for RANSAC. It is the maximum distance from a point to an epipolar @@ -1900,8 +1927,8 @@ b_2\\ @param to Second input 2D point set containing \f$(x,y)\f$. @param inliers Output vector indicating which points are inliers (1-inlier, 0-outlier). @param method Robust method used to compute transformation. The following methods are possible: -- cv::RANSAC - RANSAC-based robust method -- cv::LMEDS - Least-Median robust method +- @ref RANSAC - RANSAC-based robust method +- @ref LMEDS - Least-Median robust method RANSAC is the default method. @param ransacReprojThreshold Maximum reprojection error in the RANSAC algorithm to consider a point as an inlier. Applies only to RANSAC. @@ -1950,8 +1977,8 @@ two 2D point sets. @param to Second input 2D point set. @param inliers Output vector indicating which points are inliers. @param method Robust method used to compute transformation. The following methods are possible: -- cv::RANSAC - RANSAC-based robust method -- cv::LMEDS - Least-Median robust method +- @ref RANSAC - RANSAC-based robust method +- @ref LMEDS - Least-Median robust method RANSAC is the default method. @param ransacReprojThreshold Maximum reprojection error in the RANSAC algorithm to consider a point as an inlier. Applies only to RANSAC. 
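For the robust-method parameters documented above, a minimal sketch (assuming `from`/`to` already hold matched 2D point sets; names and values are illustrative):

    std::vector<cv::Point2f> from, to;  // matched point sets (assumed filled)
    std::vector<uchar> inliers;         // filled with 1 for inliers, 0 for outliers
    cv::Mat A = cv::estimateAffinePartial2D(from, to, inliers, cv::RANSAC,
                                            3.0 /* ransacReprojThreshold, in pixels */);
    // A is a 2x3 matrix (empty on failure); cv::RANSAC / cv::LMEDS select the robust method
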
diff --git a/modules/3d/src/five-point.cpp b/modules/3d/src/five-point.cpp index c7339d8f7c..084f791f26 100644 --- a/modules/3d/src/five-point.cpp +++ b/modules/3d/src/five-point.cpp @@ -401,7 +401,8 @@ protected: // Input should be a vector of n 2D points or a Nx2 matrix Mat findEssentialMat( InputArray _points1, InputArray _points2, InputArray _cameraMatrix, - int method, double prob, double threshold, OutputArray _mask) + int method, double prob, double threshold, + int maxIters, OutputArray _mask) { CV_INSTRUMENT_REGION(); @@ -444,20 +445,36 @@ Mat findEssentialMat( InputArray _points1, InputArray _points2, InputArray _came Mat E; if( method == RANSAC ) - createRANSACPointSetRegistrator(makePtr<EMEstimatorCallback>(), 5, threshold, prob)->run(points1, points2, E, _mask); + createRANSACPointSetRegistrator(makePtr<EMEstimatorCallback>(), 5, threshold, prob, maxIters)->run(points1, points2, E, _mask); else - createLMeDSPointSetRegistrator(makePtr<EMEstimatorCallback>(), 5, prob)->run(points1, points2, E, _mask); + createLMeDSPointSetRegistrator(makePtr<EMEstimatorCallback>(), 5, prob, maxIters)->run(points1, points2, E, _mask); return E; } +Mat findEssentialMat( InputArray _points1, InputArray _points2, InputArray _cameraMatrix, + int method, double prob, double threshold, + OutputArray _mask) +{ + return findEssentialMat(_points1, _points2, _cameraMatrix, method, prob, threshold, 1000, _mask); +} + +Mat findEssentialMat( InputArray _points1, InputArray _points2, double focal, Point2d pp, + int method, double prob, double threshold, int maxIters, OutputArray _mask) +{ + CV_INSTRUMENT_REGION(); + + Mat cameraMatrix = (Mat_<double>(3,3) << focal, 0, pp.x, 0, focal, pp.y, 0, 0, 1); + return findEssentialMat(_points1, _points2, cameraMatrix, method, prob, threshold, maxIters, _mask); +} + Mat findEssentialMat( InputArray _points1, InputArray _points2, double focal, Point2d pp, int method, double prob, double threshold, OutputArray _mask) { CV_INSTRUMENT_REGION(); Mat cameraMatrix = (Mat_<double>(3,3) << focal, 0, pp.x, 0, focal, pp.y, 0, 0, 1); - return findEssentialMat(_points1, _points2, cameraMatrix, method, prob, threshold, _mask); + return findEssentialMat(_points1, _points2, cameraMatrix, method, prob, threshold, 1000, _mask); } Mat findEssentialMat( InputArray _points1, InputArray _points2, diff --git a/modules/3d/src/fundam.cpp b/modules/3d/src/fundam.cpp index 853c304c81..921db14e34 100644 --- a/modules/3d/src/fundam.cpp +++ b/modules/3d/src/fundam.cpp @@ -878,7 +878,7 @@ Mat findFundamentalMat( InputArray _points1, InputArray _points2, if( (method & ~3) == FM_RANSAC && npoints >= 15 ) result = createRANSACPointSetRegistrator(cb, 7, ransacReprojThreshold, confidence, maxIters)->run(m1, m2, F, _mask); else - result = createLMeDSPointSetRegistrator(cb, 7, confidence)->run(m1, m2, F, _mask); + result = createLMeDSPointSetRegistrator(cb, 7, confidence, maxIters)->run(m1, m2, F, _mask); } if( result <= 0 ) diff --git a/modules/3d/src/polynom_solver.cpp b/modules/3d/src/polynom_solver.cpp index 5eb04b7bf4..201d09c439 100644 --- a/modules/3d/src/polynom_solver.cpp +++ b/modules/3d/src/polynom_solver.cpp @@ -71,7 +71,8 @@ int solve_deg3(double a, double b, double c, double d, return 3; } else { - x0 = pow(2 * R, 1 / 3.0) - b_a_3; + double cube_root = cv::cubeRoot(2 * R); + x0 = cube_root - b_a_3; return 1; } } @@ -88,8 +89,15 @@ int solve_deg3(double a, double b, double c, double d, } // D > 0, only one real root - double AD = pow(fabs(R) + sqrt(D), 1.0 / 3.0) * (R > 0 ? 1 : (R < 0 ? -1 : 0)); - double BD = (AD == 0) ?
0 : -Q / AD; + double AD = 0.; + double BD = 0.; + double R_abs = fabs(R); + if (R_abs > DBL_EPSILON) + { + AD = cv::cubeRoot(R_abs + sqrt(D)); + AD = (R >= 0) ? AD : -AD; + BD = -Q / AD; + } // Calculate the only real root x0 = AD + BD - b_a_3; diff --git a/modules/3d/src/solvepnp.cpp b/modules/3d/src/solvepnp.cpp index 01e8e9ac57..03fb6f88c0 100644 --- a/modules/3d/src/solvepnp.cpp +++ b/modules/3d/src/solvepnp.cpp @@ -334,18 +334,42 @@ bool solvePnPRansac(InputArray _opoints, InputArray _ipoints, opoints_inliers.resize(npoints1); ipoints_inliers.resize(npoints1); - result = solvePnP(opoints_inliers, ipoints_inliers, cameraMatrix, - distCoeffs, rvec, tvec, useExtrinsicGuess, - (flags == SOLVEPNP_P3P || flags == SOLVEPNP_AP3P) ? SOLVEPNP_EPNP : flags) ? 1 : -1; + try + { + result = solvePnP(opoints_inliers, ipoints_inliers, cameraMatrix, + distCoeffs, rvec, tvec, useExtrinsicGuess, + (flags == SOLVEPNP_P3P || flags == SOLVEPNP_AP3P) ? SOLVEPNP_EPNP : flags) ? 1 : -1; + } + catch (const cv::Exception& e) + { + if (flags == SOLVEPNP_ITERATIVE && + npoints1 == 5 && + e.what() && + std::string(e.what()).find("DLT algorithm needs at least 6 points") != std::string::npos + ) + { + CV_LOG_INFO(NULL, "solvePnPRansac(): solvePnP stage to compute the final pose using points " + "in the consensus set raised DLT 6 points exception, use result from MSS (Minimal Sample Sets) stage instead."); + rvec = _local_model.col(0); // output rotation vector + tvec = _local_model.col(1); // output translation vector + result = 1; + } + else + { + // raise other exceptions + throw; + } + } - if( result <= 0 ) + if (result <= 0) { _rvec.assign(_local_model.col(0)); // output rotation vector _tvec.assign(_local_model.col(1)); // output translation vector - if( _inliers.needed() ) + if (_inliers.needed()) _inliers.release(); + CV_LOG_DEBUG(NULL, "solvePnPRansac(): solvePnP stage to compute the final pose using points in the consensus set failed. 
Return false"); return false; } else diff --git a/modules/3d/src/sqpnp.hpp b/modules/3d/src/sqpnp.hpp index f8136324c9..97c10e34e7 100644 --- a/modules/3d/src/sqpnp.hpp +++ b/modules/3d/src/sqpnp.hpp @@ -72,6 +72,7 @@ private: cv::Matx r_hat; cv::Matx t; double sq_error; + SQPSolution() : sq_error(0) {} }; /* diff --git a/modules/3d/src/usac/estimator.cpp b/modules/3d/src/usac/estimator.cpp index 91abe30512..75bc3cf5dd 100644 --- a/modules/3d/src/usac/estimator.cpp +++ b/modules/3d/src/usac/estimator.cpp @@ -236,13 +236,18 @@ public: CV_DbgAssert(points); } - inline void setModelParameters (const Mat &model) override { + inline void setModelParameters(const Mat& model) override + { + CV_Assert(!model.empty()); + CV_CheckTypeEQ(model.depth(), CV_64F, ""); + const auto * const m = (double *) model.data; m11=static_cast(m[0]); m12=static_cast(m[1]); m13=static_cast(m[2]); m21=static_cast(m[3]); m22=static_cast(m[4]); m23=static_cast(m[5]); m31=static_cast(m[6]); m32=static_cast(m[7]); m33=static_cast(m[8]); const Mat model_inv = model.inv(); + CV_CheckTypeEQ(model_inv.depth(), CV_64F, ""); const auto * const minv = (double *) model_inv.data; minv11=(float)minv[0]; minv12=(float)minv[1]; minv13=(float)minv[2]; minv21=(float)minv[3]; minv22=(float)minv[4]; minv23=(float)minv[5]; @@ -299,7 +304,11 @@ public: CV_DbgAssert(points); } - inline void setModelParameters (const Mat &model) override { + inline void setModelParameters(const Mat& model) override + { + CV_Assert(!model.empty()); + CV_CheckTypeEQ(model.depth(), CV_64F, ""); + const auto * const m = (double *) model.data; m11=static_cast(m[0]); m12=static_cast(m[1]); m13=static_cast(m[2]); m21=static_cast(m[3]); m22=static_cast(m[4]); m23=static_cast(m[5]); @@ -349,7 +358,11 @@ public: CV_DbgAssert(points); } - inline void setModelParameters (const Mat &model) override { + inline void setModelParameters(const Mat& model) override + { + CV_Assert(!model.empty()); + CV_CheckTypeEQ(model.depth(), CV_64F, ""); + const auto * const m = (double *) model.data; m11=static_cast(m[0]); m12=static_cast(m[1]); m13=static_cast(m[2]); m21=static_cast(m[3]); m22=static_cast(m[4]); m23=static_cast(m[5]); @@ -416,7 +429,11 @@ public: CV_DbgAssert(points); } - inline void setModelParameters (const Mat &model) override { + inline void setModelParameters(const Mat& model) override + { + CV_Assert(!model.empty()); + CV_CheckTypeEQ(model.depth(), CV_64F, ""); + const auto * const m = (double *) model.data; m11=static_cast(m[0]); m12=static_cast(m[1]); m13=static_cast(m[2]); m21=static_cast(m[3]); m22=static_cast(m[4]); m23=static_cast(m[5]); @@ -476,7 +493,11 @@ public: } - inline void setModelParameters (const Mat &model) override { + inline void setModelParameters (const Mat& model) override + { + CV_Assert(!model.empty()); + CV_CheckTypeEQ(model.depth(), CV_64F, ""); + const auto * const p = (double *) model.data; p11 = (float)p[0]; p12 = (float)p[1]; p13 = (float)p[2]; p14 = (float)p[3]; p21 = (float)p[4]; p22 = (float)p[5]; p23 = (float)p[6]; p24 = (float)p[7]; @@ -535,7 +556,11 @@ public: CV_DbgAssert(points); } - inline void setModelParameters (const Mat &model) override { + inline void setModelParameters(const Mat& model) override + { + CV_Assert(!model.empty()); + CV_CheckTypeEQ(model.depth(), CV_64F, ""); + const auto * const m = (double *) model.data; m11 = (float)m[0]; m12 = (float)m[1]; m13 = (float)m[2]; m21 = (float)m[3]; m22 = (float)m[4]; m23 = (float)m[5]; diff --git a/modules/3d/src/usac/quality.cpp b/modules/3d/src/usac/quality.cpp index 
f29a5cfbb6..835306c4f4 100644 --- a/modules/3d/src/usac/quality.cpp +++ b/modules/3d/src/usac/quality.cpp @@ -421,7 +421,11 @@ public: * @current_hypothesis: current RANSAC iteration * Return: true if model is good, false - otherwise. */ - inline bool isModelGood (const Mat &model) override { + inline bool isModelGood(const Mat& model) override + { + if (model.empty()) + return false; + // update error object with current model err->setModelParameters(model); diff --git a/modules/3d/test/test_solvepnp_ransac.cpp b/modules/3d/test/test_solvepnp_ransac.cpp index fb0e2965e6..43b90dff92 100644 --- a/modules/3d/test/test_solvepnp_ransac.cpp +++ b/modules/3d/test/test_solvepnp_ransac.cpp @@ -837,6 +837,43 @@ TEST(Calib3d_SolvePnPRansac, double_support) EXPECT_LE(cvtest::norm(t, Mat_<double>(tF), NORM_INF), 1e-3); } +TEST(Calib3d_SolvePnPRansac, bad_input_points_19253) +{ + // with this specific data + // when computing the final pose using points in the consensus set with SOLVEPNP_ITERATIVE and solvePnP() + // an exception is thrown from solvePnP because there are 5 non-coplanar 3D points and the DLT algorithm needs at least 6 non-coplanar 3D points + // with PR #19253 we choose to return true, with the pose estimated from the MSS stage instead of throwing the exception + + float pts2d_[] = { + -5.38358629e-01f, -5.09638414e-02f, + -5.07192254e-01f, -2.20743284e-01f, + -5.43107152e-01f, -4.90474701e-02f, + -5.54325163e-01f, -1.86715424e-01f, + -5.59334219e-01f, -4.01909500e-02f, + -5.43504596e-01f, -4.61776406e-02f + }; + Mat pts2d(6, 2, CV_32FC1, pts2d_); + + float pts3d_[] = { + -3.01153604e-02f, -1.55665115e-01f, 4.50000018e-01f, + 4.27827090e-01f, 4.28645730e-01f, 1.08600008e+00f, + -3.14165242e-02f, -1.52656138e-01f, 4.50000018e-01f, + -1.46217480e-01f, 5.57961613e-02f, 7.17000008e-01f, + -4.89348806e-02f, -1.38795510e-01f, 4.47000027e-01f, + -3.13065052e-02f, -1.52636901e-01f, 4.51000035e-01f + }; + Mat pts3d(6, 3, CV_32FC1, pts3d_); + + Mat camera_mat = Mat::eye(3, 3, CV_64FC1); + Mat rvec, tvec; + vector<int> inliers; + + // solvePnPRansac will return true with 5 inliers, which means the result is from MSS stage.
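+ // (for reference, the positional arguments below are: useExtrinsicGuess=false,
+ // iterationsCount=100, reprojectionError=4.f/460.f, confidence=0.99)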
+ bool result = solvePnPRansac(pts3d, pts2d, camera_mat, noArray(), rvec, tvec, false, 100, 4.f / 460.f, 0.99, inliers); + EXPECT_EQ(inliers.size(), size_t(5)); + EXPECT_TRUE(result); +} + TEST(Calib3d_SolvePnP, input_type) { Matx33d intrinsics(5.4794130238156129e+002, 0., 2.9835545700043139e+002, 0., diff --git a/modules/3d/test/test_usac.cpp b/modules/3d/test/test_usac.cpp index 0b5cfde182..fb5641bd1e 100644 --- a/modules/3d/test/test_usac.cpp +++ b/modules/3d/test/test_usac.cpp @@ -4,7 +4,8 @@ #include "test_precomp.hpp" -namespace opencv_test { +namespace opencv_test { namespace { + enum TestSolver { Homogr, Fundam, Essen, PnP, Affine}; /* * rng -- reference to random generator @@ -264,7 +265,40 @@ TEST(usac_Fundamental, accuracy) { int(max_iters), mask); checkInliersMask(TestSolver::Fundam, inl_size, thr, pts1, pts2, F, mask); } - }} + } +} + +TEST(usac_Fundamental, regression_19639) +{ + double x_[] = { + 941, 890, + 596, 940, + 898, 941, + 894, 933, + 586, 938, + 902, 933, + 887, 935 + }; + Mat x(7, 1, CV_64FC2, x_); + + double y_[] = { + 1416, 806, + 1157, 852, + 1380, 855, + 1378, 843, + 1145, 849, + 1378, 843, + 1378, 843 + }; + Mat y(7, 1, CV_64FC2, y_); + + //std::cout << x << std::endl; + //std::cout << y << std::endl; + + Mat m = cv::findFundamentalMat(x, y, USAC_MAGSAC, 3, 0.99); + EXPECT_TRUE(m.empty()); +} + TEST(usac_Essential, accuracy) { std::vector gt_inliers; @@ -405,4 +439,5 @@ TEST(usac_testUsacParams, accuracy) { checkInliersMask(TestSolver::Homogr, inl_size, usac_params.threshold, pts1, pts2, model, mask); } -} + +}} // namespace diff --git a/modules/calib/include/opencv2/calib.hpp b/modules/calib/include/opencv2/calib.hpp index 945db3e87b..efcdd5d9e1 100644 --- a/modules/calib/include/opencv2/calib.hpp +++ b/modules/calib/include/opencv2/calib.hpp @@ -52,7 +52,7 @@ respectively) by the same factor. The joint rotation-translation matrix \f$[R|t]\f$ is the matrix product of a projective transformation and a homogeneous transformation. The 3-by-4 projective transformation maps 3D points -represented in camera coordinates to 2D poins in the image plane and represented in normalized +represented in camera coordinates to 2D points in the image plane and represented in normalized camera coordinates \f$x' = X_c / Z_c\f$ and \f$y' = Y_c / Z_c\f$: \f[Z_c \begin{bmatrix} @@ -484,13 +484,13 @@ CV_EXPORTS_W Mat initCameraMatrix2D( InputArrayOfArrays objectPoints, ( patternSize = cv::Size(points_per_row,points_per_colum) = cv::Size(columns,rows) ). @param corners Output array of detected corners. @param flags Various operation flags that can be zero or a combination of the following values: -- **CALIB_CB_ADAPTIVE_THRESH** Use adaptive thresholding to convert the image to black +- @ref CALIB_CB_ADAPTIVE_THRESH Use adaptive thresholding to convert the image to black and white, rather than a fixed threshold level (computed from the average image brightness). -- **CALIB_CB_NORMALIZE_IMAGE** Normalize the image gamma with equalizeHist before +- @ref CALIB_CB_NORMALIZE_IMAGE Normalize the image gamma with equalizeHist before applying fixed or adaptive thresholding. -- **CALIB_CB_FILTER_QUADS** Use additional criteria (like contour area, perimeter, +- @ref CALIB_CB_FILTER_QUADS Use additional criteria (like contour area, perimeter, square-like shape) to filter out false quads extracted at the contour retrieval stage. 
-- **CALIB_CB_FAST_CHECK** Run a fast check on the image that looks for chessboard corners, +- @ref CALIB_CB_FAST_CHECK Run a fast check on the image that looks for chessboard corners, and shortcut the call if none is found. This can drastically speed up the call in the degenerate condition when no chessboard is observed. @@ -542,11 +542,11 @@ CV_EXPORTS_W bool checkChessboard(InputArray img, Size size); ( patternSize = cv::Size(points_per_row,points_per_column) = cv::Size(columns,rows) ). @param corners Output array of detected corners. @param flags Various operation flags that can be zero or a combination of the following values: -- **CALIB_CB_NORMALIZE_IMAGE** Normalize the image gamma with equalizeHist before detection. -- **CALIB_CB_EXHAUSTIVE** Run an exhaustive search to improve detection rate. -- **CALIB_CB_ACCURACY** Up sample input image to improve sub-pixel accuracy due to aliasing effects. -- **CALIB_CB_LARGER** The detected pattern is allowed to be larger than patternSize (see description). -- **CALIB_CB_MARKER** The detected pattern must have a marker (see description). +- @ref CALIB_CB_NORMALIZE_IMAGE Normalize the image gamma with equalizeHist before detection. +- @ref CALIB_CB_EXHAUSTIVE Run an exhaustive search to improve detection rate. +- @ref CALIB_CB_ACCURACY Up sample input image to improve sub-pixel accuracy due to aliasing effects. +- @ref CALIB_CB_LARGER The detected pattern is allowed to be larger than patternSize (see description). +- @ref CALIB_CB_MARKER The detected pattern must have a marker (see description). This should be used if an accurate camera calibration is required. @param meta Optional output array of detected corners (CV_8UC1 and size = cv::Size(columns,rows)). Each entry stands for one corner of the pattern and can have one of the following values: @@ -565,7 +565,7 @@ Calibration" demonstrating that the returned sub-pixel positions are more accurate than the one returned by cornerSubPix allowing a precise camera calibration for demanding applications. -In the case, the flags **CALIB_CB_LARGER** or **CALIB_CB_MARKER** are given, +In case the flags @ref CALIB_CB_LARGER or @ref CALIB_CB_MARKER are given, the result can be recovered from the optional meta array. Both flags are helpful to use calibration patterns exceeding the field of view of the camera. These oversized patterns allow more accurate calibrations as corners can be @@ -682,11 +682,12 @@ typedef CirclesGridFinderParameters CirclesGridFinderParameters2; ( patternSize = Size(points_per_row, points_per_column) ). @param centers output array of detected centers. @param flags various operation flags that can be one of the following values: -- **CALIB_CB_SYMMETRIC_GRID** uses symmetric pattern of circles. -- **CALIB_CB_ASYMMETRIC_GRID** uses asymmetric pattern of circles. -- **CALIB_CB_CLUSTERING** uses a special algorithm for grid detection. It is more robust to +- @ref CALIB_CB_SYMMETRIC_GRID uses symmetric pattern of circles. +- @ref CALIB_CB_ASYMMETRIC_GRID uses asymmetric pattern of circles. +- @ref CALIB_CB_CLUSTERING uses a special algorithm for grid detection. It is more robust to perspective distortions but much more sensitive to background clutter. @param blobDetector feature detector that finds blobs like dark circles on light background. + If `blobDetector` is NULL then `image` represents Point2f array of candidates. @param parameters struct for finding circles in a grid pattern. The function attempts to determine whether the input image contains a grid of circles.
If it is, the @@ -697,7 +698,7 @@ row). Otherwise, if the function fails to find all the corners or reorder them, Sample usage of detecting and drawing the centers of circles: : @code Size patternsize(7,7); //number of centers - Mat gray = ....; //source image + Mat gray = ...; //source image vector centers; //this will be filled by the detected centers bool patternfound = findCirclesGrid(gray, patternsize, centers); @@ -736,8 +737,8 @@ respectively. In the old interface all the vectors of object points from differe concatenated together. @param imageSize Size of the image used only to initialize the camera intrinsic matrix. @param cameraMatrix Input/output 3x3 floating-point camera intrinsic matrix -\f$\cameramatrix{A}\f$ . If CV\_CALIB\_USE\_INTRINSIC\_GUESS -and/or CALIB_FIX_ASPECT_RATIO are specified, some or all of fx, fy, cx, cy must be +\f$\cameramatrix{A}\f$ . If @ref CALIB_USE_INTRINSIC_GUESS +and/or @ref CALIB_FIX_ASPECT_RATIO are specified, some or all of fx, fy, cx, cy must be initialized before calling the function. @param distCoeffs Input/output vector of distortion coefficients \f$\distcoeffs\f$. @@ -760,40 +761,40 @@ parameters. Order of deviations values: \f$(R_0, T_0, \dotsc , R_{M - 1}, T_{M - the number of pattern views. \f$R_i, T_i\f$ are concatenated 1x3 vectors. @param perViewErrors Output vector of the RMS re-projection error estimated for each pattern view. @param flags Different flags that may be zero or a combination of the following values: -- **CALIB_USE_INTRINSIC_GUESS** cameraMatrix contains valid initial values of +- @ref CALIB_USE_INTRINSIC_GUESS cameraMatrix contains valid initial values of fx, fy, cx, cy that are optimized further. Otherwise, (cx, cy) is initially set to the image center ( imageSize is used), and focal distances are computed in a least-squares fashion. Note, that if intrinsic parameters are known, there is no need to use this function just to estimate extrinsic parameters. Use solvePnP instead. -- **CALIB_FIX_PRINCIPAL_POINT** The principal point is not changed during the global +- @ref CALIB_FIX_PRINCIPAL_POINT The principal point is not changed during the global optimization. It stays at the center or at a different location specified when -CALIB_USE_INTRINSIC_GUESS is set too. -- **CALIB_FIX_ASPECT_RATIO** The functions consider only fy as a free parameter. The + @ref CALIB_USE_INTRINSIC_GUESS is set too. +- @ref CALIB_FIX_ASPECT_RATIO The functions consider only fy as a free parameter. The ratio fx/fy stays the same as in the input cameraMatrix . When -CALIB_USE_INTRINSIC_GUESS is not set, the actual input values of fx and fy are + @ref CALIB_USE_INTRINSIC_GUESS is not set, the actual input values of fx and fy are ignored, only their ratio is computed and used further. -- **CALIB_ZERO_TANGENT_DIST** Tangential distortion coefficients \f$(p_1, p_2)\f$ are set +- @ref CALIB_ZERO_TANGENT_DIST Tangential distortion coefficients \f$(p_1, p_2)\f$ are set to zeros and stay zero. -- **CALIB_FIX_K1,...,CALIB_FIX_K6** The corresponding radial distortion -coefficient is not changed during the optimization. If CALIB_USE_INTRINSIC_GUESS is +- @ref CALIB_FIX_K1,..., @ref CALIB_FIX_K6 The corresponding radial distortion +coefficient is not changed during the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the supplied distCoeffs matrix is used. Otherwise, it is set to 0. -- **CALIB_RATIONAL_MODEL** Coefficients k4, k5, and k6 are enabled. 
To provide the +- @ref CALIB_RATIONAL_MODEL Coefficients k4, k5, and k6 are enabled. To provide the backward compatibility, this extra flag should be explicitly specified to make the calibration function use the rational model and return 8 coefficients. If the flag is not set, the function computes and returns only 5 distortion coefficients. -- **CALIB_THIN_PRISM_MODEL** Coefficients s1, s2, s3 and s4 are enabled. To provide the +- @ref CALIB_THIN_PRISM_MODEL Coefficients s1, s2, s3 and s4 are enabled. To provide the backward compatibility, this extra flag should be explicitly specified to make the calibration function use the thin prism model and return 12 coefficients. If the flag is not set, the function computes and returns only 5 distortion coefficients. -- **CALIB_FIX_S1_S2_S3_S4** The thin prism distortion coefficients are not changed during -the optimization. If CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the +- @ref CALIB_FIX_S1_S2_S3_S4 The thin prism distortion coefficients are not changed during +the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the supplied distCoeffs matrix is used. Otherwise, it is set to 0. -- **CALIB_TILTED_MODEL** Coefficients tauX and tauY are enabled. To provide the +- @ref CALIB_TILTED_MODEL Coefficients tauX and tauY are enabled. To provide the backward compatibility, this extra flag should be explicitly specified to make the calibration function use the tilted sensor model and return 14 coefficients. If the flag is not set, the function computes and returns only 5 distortion coefficients. -- **CALIB_FIX_TAUX_TAUY** The coefficients of the tilted sensor model are not changed during -the optimization. If CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the +- @ref CALIB_FIX_TAUX_TAUY The coefficients of the tilted sensor model are not changed during +the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the supplied distCoeffs matrix is used. Otherwise, it is set to 0. @param criteria Termination criteria for the iterative optimization algorithm. @@ -805,7 +806,7 @@ points and their corresponding 2D projections in each view must be specified. Th by using an object with known geometry and easily detectable feature points. Such an object is called a calibration rig or calibration pattern, and OpenCV has built-in support for a chessboard as a calibration rig (see @ref findChessboardCorners). Currently, initialization of intrinsic -parameters (when CALIB_USE_INTRINSIC_GUESS is not set) is only implemented for planar calibration +parameters (when @ref CALIB_USE_INTRINSIC_GUESS is not set) is only implemented for planar calibration patterns (where Z-coordinates of the object points must be all zeros). 3D calibration rigs can also be used as long as initial cameraMatrix is provided. @@ -988,39 +989,39 @@ second camera coordinate system. @param F Output fundamental matrix. @param perViewErrors Output vector of the RMS re-projection error estimated for each pattern view. @param flags Different flags that may be zero or a combination of the following values: -- **CALIB_FIX_INTRINSIC** Fix cameraMatrix? and distCoeffs? so that only R, T, E, and F +- @ref CALIB_FIX_INTRINSIC Fix cameraMatrix? and distCoeffs? so that only R, T, E, and F matrices are estimated. -- **CALIB_USE_INTRINSIC_GUESS** Optimize some or all of the intrinsic parameters +- @ref CALIB_USE_INTRINSIC_GUESS Optimize some or all of the intrinsic parameters according to the specified flags. 
Initial values are provided by the user. -- **CALIB_USE_EXTRINSIC_GUESS** R and T contain valid initial values that are optimized further. +- @ref CALIB_USE_EXTRINSIC_GUESS R and T contain valid initial values that are optimized further. Otherwise R and T are initialized to the median value of the pattern views (each dimension separately). -- **CALIB_FIX_PRINCIPAL_POINT** Fix the principal points during the optimization. -- **CALIB_FIX_FOCAL_LENGTH** Fix \f$f^{(j)}_x\f$ and \f$f^{(j)}_y\f$ . -- **CALIB_FIX_ASPECT_RATIO** Optimize \f$f^{(j)}_y\f$ . Fix the ratio \f$f^{(j)}_x/f^{(j)}_y\f$ +- @ref CALIB_FIX_PRINCIPAL_POINT Fix the principal points during the optimization. +- @ref CALIB_FIX_FOCAL_LENGTH Fix \f$f^{(j)}_x\f$ and \f$f^{(j)}_y\f$ . +- @ref CALIB_FIX_ASPECT_RATIO Optimize \f$f^{(j)}_y\f$ . Fix the ratio \f$f^{(j)}_x/f^{(j)}_y\f$ . -- **CALIB_SAME_FOCAL_LENGTH** Enforce \f$f^{(0)}_x=f^{(1)}_x\f$ and \f$f^{(0)}_y=f^{(1)}_y\f$ . -- **CALIB_ZERO_TANGENT_DIST** Set tangential distortion coefficients for each camera to +- @ref CALIB_SAME_FOCAL_LENGTH Enforce \f$f^{(0)}_x=f^{(1)}_x\f$ and \f$f^{(0)}_y=f^{(1)}_y\f$ . +- @ref CALIB_ZERO_TANGENT_DIST Set tangential distortion coefficients for each camera to zeros and fix there. -- **CALIB_FIX_K1,...,CALIB_FIX_K6** Do not change the corresponding radial -distortion coefficient during the optimization. If CALIB_USE_INTRINSIC_GUESS is set, +- @ref CALIB_FIX_K1,..., @ref CALIB_FIX_K6 Do not change the corresponding radial +distortion coefficient during the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the supplied distCoeffs matrix is used. Otherwise, it is set to 0. -- **CALIB_RATIONAL_MODEL** Enable coefficients k4, k5, and k6. To provide the backward +- @ref CALIB_RATIONAL_MODEL Enable coefficients k4, k5, and k6. To provide the backward compatibility, this extra flag should be explicitly specified to make the calibration function use the rational model and return 8 coefficients. If the flag is not set, the function computes and returns only 5 distortion coefficients. -- **CALIB_THIN_PRISM_MODEL** Coefficients s1, s2, s3 and s4 are enabled. To provide the +- @ref CALIB_THIN_PRISM_MODEL Coefficients s1, s2, s3 and s4 are enabled. To provide the backward compatibility, this extra flag should be explicitly specified to make the calibration function use the thin prism model and return 12 coefficients. If the flag is not set, the function computes and returns only 5 distortion coefficients. -- **CALIB_FIX_S1_S2_S3_S4** The thin prism distortion coefficients are not changed during -the optimization. If CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the +- @ref CALIB_FIX_S1_S2_S3_S4 The thin prism distortion coefficients are not changed during +the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the supplied distCoeffs matrix is used. Otherwise, it is set to 0. -- **CALIB_TILTED_MODEL** Coefficients tauX and tauY are enabled. To provide the +- @ref CALIB_TILTED_MODEL Coefficients tauX and tauY are enabled. To provide the backward compatibility, this extra flag should be explicitly specified to make the calibration function use the tilted sensor model and return 14 coefficients. If the flag is not set, the function computes and returns only 5 distortion coefficients. -- **CALIB_FIX_TAUX_TAUY** The coefficients of the tilted sensor model are not changed during -the optimization. 
If CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the +- @ref CALIB_FIX_TAUX_TAUY The coefficients of the tilted sensor model are not changed during +the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the supplied distCoeffs matrix is used. Otherwise, it is set to 0. @param criteria Termination criteria for the iterative optimization algorithm. @@ -1068,10 +1069,10 @@ Besides the stereo-related information, the function can also perform a full cal the two cameras. However, due to the high dimensionality of the parameter space and noise in the input data, the function can diverge from the correct solution. If the intrinsic parameters can be estimated with high accuracy for each of the cameras individually (for example, using -calibrateCamera ), you are recommended to do so and then pass CALIB_FIX_INTRINSIC flag to the +calibrateCamera ), you are recommended to do so and then pass @ref CALIB_FIX_INTRINSIC flag to the function along with the computed intrinsic parameters. Otherwise, if all the parameters are estimated at once, it makes sense to restrict some parameters, for example, pass -CALIB_SAME_FOCAL_LENGTH and CALIB_ZERO_TANGENT_DIST flags, which is usually a + @ref CALIB_SAME_FOCAL_LENGTH and @ref CALIB_ZERO_TANGENT_DIST flags, which is usually a reasonable assumption. Similarly to calibrateCamera, the function minimizes the total re-projection error for all the @@ -1409,7 +1410,9 @@ enum{ CALIB_FIX_K3 = 1 << 6, CALIB_FIX_K4 = 1 << 7, CALIB_FIX_INTRINSIC = 1 << 8, - CALIB_FIX_PRINCIPAL_POINT = 1 << 9 + CALIB_FIX_PRINCIPAL_POINT = 1 << 9, + CALIB_ZERO_DISPARITY = 1 << 10, + CALIB_FIX_FOCAL_LENGTH = 1 << 11 }; /** @brief Projects points using fisheye model @@ -1542,7 +1545,7 @@ objectPoints[i].size() for each i. @param image_size Size of the image used only to initialize the camera intrinsic matrix. @param K Output 3x3 floating-point camera intrinsic matrix \f$\cameramatrix{A}\f$ . If -fisheye::CALIB_USE_INTRINSIC_GUESS/ is specified, some or all of fx, fy, cx, cy must be +@ref fisheye::CALIB_USE_INTRINSIC_GUESS is specified, some or all of fx, fy, cx, cy must be initialized before calling the function. @param D Output vector of distortion coefficients \f$\distcoeffsfisheye\f$. @param rvecs Output vector of rotation vectors (see Rodrigues ) estimated for each pattern view. @@ -1552,17 +1555,19 @@ space (in which object points are specified) to the world coordinate space, that position of the calibration pattern in the k-th pattern view (k=0.. *M* -1). @param tvecs Output vector of translation vectors estimated for each pattern view. @param flags Different flags that may be zero or a combination of the following values: -- **fisheye::CALIB_USE_INTRINSIC_GUESS** cameraMatrix contains valid initial values of +- @ref fisheye::CALIB_USE_INTRINSIC_GUESS cameraMatrix contains valid initial values of fx, fy, cx, cy that are optimized further. Otherwise, (cx, cy) is initially set to the image center ( imageSize is used), and focal distances are computed in a least-squares fashion. -- **fisheye::CALIB_RECOMPUTE_EXTRINSIC** Extrinsic will be recomputed after each iteration +- @ref fisheye::CALIB_RECOMPUTE_EXTRINSIC Extrinsic will be recomputed after each iteration of intrinsic optimization. -- **fisheye::CALIB_CHECK_COND** The functions will check validity of condition number. -- **fisheye::CALIB_FIX_SKEW** Skew coefficient (alpha) is set to zero and stay zero. 
-- **fisheye::CALIB_FIX_K1..fisheye::CALIB_FIX_K4** Selected distortion coefficients +- @ref fisheye::CALIB_CHECK_COND The functions will check validity of condition number. +- @ref fisheye::CALIB_FIX_SKEW Skew coefficient (alpha) is set to zero and stay zero. +- @ref fisheye::CALIB_FIX_K1,..., @ref fisheye::CALIB_FIX_K4 Selected distortion coefficients are set to zeros and stay zero. -- **fisheye::CALIB_FIX_PRINCIPAL_POINT** The principal point is not changed during the global -optimization. It stays at the center or at a different location specified when CALIB_USE_INTRINSIC_GUESS is set too. +- @ref fisheye::CALIB_FIX_PRINCIPAL_POINT The principal point is not changed during the global +optimization. It stays at the center or at a different location specified when @ref fisheye::CALIB_USE_INTRINSIC_GUESS is set too. +- @ref fisheye::CALIB_FIX_FOCAL_LENGTH The focal length is not changed during the global +optimization. It is the \f$max(width,height)/\pi\f$ or the provided \f$f_x\f$, \f$f_y\f$ when @ref fisheye::CALIB_USE_INTRINSIC_GUESS is set too. @param criteria Termination criteria for the iterative optimization algorithm. */ CV_EXPORTS_W double calibrate(InputArrayOfArrays objectPoints, InputArrayOfArrays imagePoints, const Size& image_size, @@ -1586,7 +1591,7 @@ camera. @param P2 Output 3x4 projection matrix in the new (rectified) coordinate systems for the second camera. @param Q Output \f$4 \times 4\f$ disparity-to-depth mapping matrix (see reprojectImageTo3D ). -@param flags Operation flags that may be zero or CALIB_ZERO_DISPARITY . If the flag is set, +@param flags Operation flags that may be zero or @ref fisheye::CALIB_ZERO_DISPARITY . If the flag is set, the function makes the principal points of each camera have the same pixel coordinates in the rectified views. And if the flag is not set, the function may still shift the images in the horizontal or vertical direction (depending on the orientation of epipolar lines) to maximize the @@ -1612,7 +1617,7 @@ observed by the first camera. observed by the second camera. @param K1 Input/output first camera intrinsic matrix: \f$\vecthreethree{f_x^{(j)}}{0}{c_x^{(j)}}{0}{f_y^{(j)}}{c_y^{(j)}}{0}{0}{1}\f$ , \f$j = 0,\, 1\f$ . If -any of fisheye::CALIB_USE_INTRINSIC_GUESS , fisheye::CALIB_FIX_INTRINSIC are specified, +any of @ref fisheye::CALIB_USE_INTRINSIC_GUESS , @ref fisheye::CALIB_FIX_INTRINSIC are specified, some or all of the matrix components must be initialized. @param D1 Input/output vector of distortion coefficients \f$\distcoeffsfisheye\f$ of 4 elements. @param K2 Input/output second camera intrinsic matrix. The parameter is similar to K1 . @@ -1622,16 +1627,16 @@ similar to D1 . @param R Output rotation matrix between the 1st and the 2nd camera coordinate systems. @param T Output translation vector between the coordinate systems of the cameras. @param flags Different flags that may be zero or a combination of the following values: -- **fisheye::CALIB_FIX_INTRINSIC** Fix K1, K2? and D1, D2? so that only R, T matrices +- @ref fisheye::CALIB_FIX_INTRINSIC Fix K1, K2? and D1, D2? so that only R, T matrices are estimated. -- **fisheye::CALIB_USE_INTRINSIC_GUESS** K1, K2 contains valid initial values of +- @ref fisheye::CALIB_USE_INTRINSIC_GUESS K1, K2 contains valid initial values of fx, fy, cx, cy that are optimized further. Otherwise, (cx, cy) is initially set to the image center (imageSize is used), and focal distances are computed in a least-squares fashion. 
-- **fisheye::CALIB_RECOMPUTE_EXTRINSIC** Extrinsic will be recomputed after each iteration +- @ref fisheye::CALIB_RECOMPUTE_EXTRINSIC Extrinsic will be recomputed after each iteration of intrinsic optimization. -- **fisheye::CALIB_CHECK_COND** The functions will check validity of condition number. -- **fisheye::CALIB_FIX_SKEW** Skew coefficient (alpha) is set to zero and stay zero. -- **fisheye::CALIB_FIX_K1..4** Selected distortion coefficients are set to zeros and stay +- @ref fisheye::CALIB_CHECK_COND The functions will check validity of condition number. +- @ref fisheye::CALIB_FIX_SKEW Skew coefficient (alpha) is set to zero and stay zero. +- @ref fisheye::CALIB_FIX_K1,..., @ref fisheye::CALIB_FIX_K4 Selected distortion coefficients are set to zeros and stay zero. @param criteria Termination criteria for the iterative optimization algorithm. */ diff --git a/modules/calib/src/calibinit.cpp b/modules/calib/src/calibinit.cpp index b776a1b617..ac4fc23d03 100644 --- a/modules/calib/src/calibinit.cpp +++ b/modules/calib/src/calibinit.cpp @@ -1837,7 +1837,7 @@ void ChessBoardDetector::generateQuads(const Mat& image_, int flags) if (boardIdx != parentIdx && (boardIdx < 0 || contour_child_counter[boardIdx] < contour_child_counter[parentIdx])) boardIdx = parentIdx; - contour_quads.push_back(QuadCountour(pt, parentIdx)); + contour_quads.emplace_back(pt, parentIdx); } size_t total = contour_quads.size(); @@ -2173,13 +2173,6 @@ void drawChessboardCorners( InputOutputArray image, Size patternSize, } } -static int quiet_error(int /*status*/, const char* /*func_name*/, - const char* /*err_msg*/, const char* /*file_name*/, - int /*line*/, void* /*userdata*/) -{ - return 0; -} - bool findCirclesGrid( InputArray _image, Size patternSize, OutputArray _centers, int flags, const Ptr &blobDetector, const CirclesGridFinderParameters& parameters_) @@ -2192,15 +2185,22 @@ bool findCirclesGrid( InputArray _image, Size patternSize, bool isSymmetricGrid = (flags & CALIB_CB_SYMMETRIC_GRID ) ? 
true : false; CV_Assert(isAsymmetricGrid ^ isSymmetricGrid); - Mat image = _image.getMat(); std::vector<Point2f> centers; - std::vector<KeyPoint> keypoints; - blobDetector->detect(image, keypoints); std::vector<Point2f> points; - for (size_t i = 0; i < keypoints.size(); i++) + if (blobDetector) { - points.push_back (keypoints[i].pt); + std::vector<KeyPoint> keypoints; + blobDetector->detect(_image, keypoints); + for (size_t i = 0; i < keypoints.size(); i++) + { + points.push_back(keypoints[i].pt); + } + } + else + { + CV_CheckTypeEQ(_image.type(), CV_32FC2, "blobDetector must be provided or image must contain Point2f array (std::vector<Point2f>) with candidates"); + _image.copyTo(points); } if(flags & CALIB_CB_ASYMMETRIC_GRID) @@ -2216,64 +2216,59 @@ bool findCirclesGrid( InputArray _image, Size patternSize, return !centers.empty(); } + bool isValid = false; const int attempts = 2; const size_t minHomographyPoints = 4; Mat H; for (int i = 0; i < attempts; i++) { - centers.clear(); - CirclesGridFinder boxFinder(patternSize, points, parameters); - bool isFound = false; -#define BE_QUIET 1 -#if BE_QUIET - void* oldCbkData; - ErrorCallback oldCbk = redirectError(quiet_error, 0, &oldCbkData); // FIXIT not thread safe -#endif - try - { - isFound = boxFinder.findHoles(); - } - catch (const Exception &) - { - - } -#if BE_QUIET - redirectError(oldCbk, oldCbkData); -#endif - if (isFound) - { - switch(parameters.gridType) + centers.clear(); + CirclesGridFinder boxFinder(patternSize, points, parameters); + try { - case CirclesGridFinderParameters::SYMMETRIC_GRID: - boxFinder.getHoles(centers); - break; - case CirclesGridFinderParameters::ASYMMETRIC_GRID: - boxFinder.getAsymmetricHoles(centers); - break; - default: - CV_Error(Error::StsBadArg, "Unknown pattern type"); + bool isFound = boxFinder.findHoles(); + if (isFound) + { + switch(parameters.gridType) + { + case CirclesGridFinderParameters::SYMMETRIC_GRID: + boxFinder.getHoles(centers); + break; + case CirclesGridFinderParameters::ASYMMETRIC_GRID: + boxFinder.getAsymmetricHoles(centers); + break; + default: + CV_Error(Error::StsBadArg, "Unknown pattern type"); + } + + isValid = true; + break; // done, return result + } + } + catch (const cv::Exception& e) + { + CV_UNUSED(e); + CV_LOG_DEBUG(NULL, "findCirclesGrid2: attempt=" << i << ": " << e.what()); + // nothing, next attempt } - if (i != 0) + boxFinder.getHoles(centers); + if (i != attempts - 1) { - Mat orgPointsMat; - transform(centers, orgPointsMat, H.inv()); - convertPointsFromHomogeneous(orgPointsMat, centers); + if (centers.size() < minHomographyPoints) + break; + H = CirclesGridFinder::rectifyGrid(boxFinder.getDetectedGridSize(), centers, points, points); } - Mat(centers).copyTo(_centers); - return true; - } + } - boxFinder.getHoles(centers); - if (i != attempts - 1) - { - if (centers.size() < minHomographyPoints) - break; - H = CirclesGridFinder::rectifyGrid(boxFinder.getDetectedGridSize(), centers, points, points); - } + if (!centers.empty() && !H.empty()) // undone rectification + { + Mat orgPointsMat; + transform(centers, orgPointsMat, H.inv()); + convertPointsFromHomogeneous(orgPointsMat, centers); } Mat(centers).copyTo(_centers); - return false; + return isValid; } bool findCirclesGrid(InputArray _image, Size patternSize, diff --git a/modules/calib/src/checkchessboard.cpp b/modules/calib/src/checkchessboard.cpp index 987790eedb..47995297c2 100644 --- a/modules/calib/src/checkchessboard.cpp +++ b/modules/calib/src/checkchessboard.cpp @@ -76,7 +76,7 @@ static void icvGetQuadrangleHypotheses(const std::vector<std::vector< cv::Point > >& contours, - quads.push_back(std::pair<float, int>(box_size, class_id)); +
quads.emplace_back(box_size, class_id); } } diff --git a/modules/calib/src/circlesgrid.cpp b/modules/calib/src/circlesgrid.cpp index d60069d1c1..90c123e140 100644 --- a/modules/calib/src/circlesgrid.cpp +++ b/modules/calib/src/circlesgrid.cpp @@ -384,15 +384,15 @@ void CirclesGridClusterFinder::rectifyPatternPoints(const std::vector trueIndices; - trueIndices.push_back(Point(0, 0)); - trueIndices.push_back(Point(patternSize.width - 1, 0)); + trueIndices.emplace_back(0, 0); + trueIndices.emplace_back(patternSize.width - 1, 0); if(isAsymmetricGrid) { - trueIndices.push_back(Point(patternSize.width - 1, 1)); - trueIndices.push_back(Point(patternSize.width - 1, patternSize.height - 2)); + trueIndices.emplace_back(patternSize.width - 1, 1); + trueIndices.emplace_back(patternSize.width - 1, patternSize.height - 2); } - trueIndices.push_back(Point(patternSize.width - 1, patternSize.height - 1)); - trueIndices.push_back(Point(0, patternSize.height - 1)); + trueIndices.emplace_back(patternSize.width - 1, patternSize.height - 1); + trueIndices.emplace_back(0, patternSize.height - 1); std::vector idealPoints; for(size_t idx=0; idx (id, Vertex())); + vertices.emplace(id, Vertex()); } void Graph::addEdge(size_t id1, size_t id2) @@ -889,10 +889,9 @@ Mat CirclesGridFinder::rectifyGrid(Size detectedGridSize, const std::vector //all 8 segments with one end in a corner std::vector corner; - corner.push_back(Segment(keypoints[points[1][0]], keypoints[points[0][0]])); - corner.push_back(Segment(keypoints[points[0][0]], keypoints[points[0][1]])); + corner.emplace_back(keypoints[points[1][0]], keypoints[points[0][0]]); + corner.emplace_back(keypoints[points[0][0]], keypoints[points[0][1]]); segments.push_back(corner); - cornerIndices.push_back(Point(0, 0)); - firstSteps.push_back(Point(1, 0)); - secondSteps.push_back(Point(0, 1)); + cornerIndices.emplace_back(0, 0); + firstSteps.emplace_back(1, 0); + secondSteps.emplace_back(0, 1); corner.clear(); - corner.push_back(Segment(keypoints[points[0][w - 2]], keypoints[points[0][w - 1]])); - corner.push_back(Segment(keypoints[points[0][w - 1]], keypoints[points[1][w - 1]])); + corner.emplace_back(keypoints[points[0][w - 2]], keypoints[points[0][w - 1]]); + corner.emplace_back(keypoints[points[0][w - 1]], keypoints[points[1][w - 1]]); segments.push_back(corner); - cornerIndices.push_back(Point(w - 1, 0)); - firstSteps.push_back(Point(0, 1)); - secondSteps.push_back(Point(-1, 0)); + cornerIndices.emplace_back(w - 1, 0); + firstSteps.emplace_back(0, 1); + secondSteps.emplace_back(-1, 0); corner.clear(); - corner.push_back(Segment(keypoints[points[h - 2][w - 1]], keypoints[points[h - 1][w - 1]])); - corner.push_back(Segment(keypoints[points[h - 1][w - 1]], keypoints[points[h - 1][w - 2]])); + corner.emplace_back(keypoints[points[h - 2][w - 1]], keypoints[points[h - 1][w - 1]]); + corner.emplace_back(keypoints[points[h - 1][w - 1]], keypoints[points[h - 1][w - 2]]); segments.push_back(corner); - cornerIndices.push_back(Point(w - 1, h - 1)); - firstSteps.push_back(Point(-1, 0)); - secondSteps.push_back(Point(0, -1)); + cornerIndices.emplace_back(w - 1, h - 1); + firstSteps.emplace_back(-1, 0); + secondSteps.emplace_back(0, -1); corner.clear(); - corner.push_back(Segment(keypoints[points[h - 1][1]], keypoints[points[h - 1][0]])); - corner.push_back(Segment(keypoints[points[h - 1][0]], keypoints[points[h - 2][0]])); - cornerIndices.push_back(Point(0, h - 1)); - firstSteps.push_back(Point(0, -1)); - secondSteps.push_back(Point(1, 0)); + corner.emplace_back(keypoints[points[h - 
1][1]], keypoints[points[h - 1][0]]); + corner.emplace_back(keypoints[points[h - 1][0]], keypoints[points[h - 2][0]]); + cornerIndices.emplace_back(0, h - 1); + firstSteps.emplace_back(0, -1); + secondSteps.emplace_back(1, 0); segments.push_back(corner); corner.clear(); @@ -1616,7 +1615,7 @@ size_t CirclesGridFinder::getFirstCorner(std::vector &largeCornerIndices, int cornerIdx = 0; bool waitOutsider = true; - for(;;) + for (size_t i = 0; i < cornersCount * 2; ++i) { if (waitOutsider) { @@ -1626,13 +1625,13 @@ size_t CirclesGridFinder::getFirstCorner(std::vector &largeCornerIndices, else { if (isInsider[(cornerIdx + 1) % cornersCount]) - break; + return cornerIdx; } cornerIdx = (cornerIdx + 1) % cornersCount; } - return cornerIdx; + CV_Error(Error::StsNoConv, "isInsider array has the same values"); } } diff --git a/modules/calib/src/fisheye.cpp b/modules/calib/src/fisheye.cpp index 5874b7faa7..2d79b8fa94 100644 --- a/modules/calib/src/fisheye.cpp +++ b/modules/calib/src/fisheye.cpp @@ -756,8 +756,8 @@ double cv::fisheye::calibrate(InputArrayOfArrays objectPoints, InputArrayOfArray IntrinsicParams currentParam; IntrinsicParams errors; - finalParam.isEstimate[0] = 1; - finalParam.isEstimate[1] = 1; + finalParam.isEstimate[0] = flags & CALIB_FIX_FOCAL_LENGTH ? 0 : 1; + finalParam.isEstimate[1] = flags & CALIB_FIX_FOCAL_LENGTH ? 0 : 1; finalParam.isEstimate[2] = flags & CALIB_FIX_PRINCIPAL_POINT ? 0 : 1; finalParam.isEstimate[3] = flags & CALIB_FIX_PRINCIPAL_POINT ? 0 : 1; finalParam.isEstimate[4] = flags & CALIB_FIX_SKEW ? 0 : 1; diff --git a/modules/calib/test/test_chesscorners.cpp b/modules/calib/test/test_chesscorners.cpp index 3d730f6bfd..b4d0628c87 100644 --- a/modules/calib/test/test_chesscorners.cpp +++ b/modules/calib/test/test_chesscorners.cpp @@ -656,5 +656,99 @@ TEST(Calib3d_CirclesPatternDetectorWithClustering, accuracy) ASSERT_LE(error, precise_success_error_level); } +TEST(Calib3d_AsymmetricCirclesPatternDetector, regression_18713) +{ + float pts_[][2] = { + { 166.5, 107 }, { 146, 236 }, { 147, 92 }, { 184, 162 }, { 150, 185.5 }, + { 215, 105 }, { 270.5, 186 }, { 159, 142 }, { 6, 205.5 }, { 32, 148.5 }, + { 126, 163.5 }, { 181, 208.5 }, { 240.5, 62 }, { 84.5, 76.5 }, { 190, 120.5 }, + { 10, 189 }, { 266, 104 }, { 307.5, 207.5 }, { 97, 184 }, { 116.5, 210 }, + { 114, 139 }, { 84.5, 233 }, { 269.5, 139 }, { 136, 126.5 }, { 120, 107.5 }, + { 129.5, 65.5 }, { 212.5, 140.5 }, { 204.5, 60.5 }, { 207.5, 241 }, { 61.5, 94.5 }, + { 186.5, 61.5 }, { 220, 63 }, { 239, 120.5 }, { 212, 186 }, { 284, 87.5 }, + { 62, 114.5 }, { 283, 61.5 }, { 238.5, 88.5 }, { 243, 159 }, { 245, 208 }, + { 298.5, 158.5 }, { 57, 129 }, { 156.5, 63.5 }, { 192, 90.5 }, { 281, 235.5 }, + { 172, 62.5 }, { 291.5, 119.5 }, { 90, 127 }, { 68.5, 166.5 }, { 108.5, 83.5 }, + { 22, 176 } + }; + Mat candidates(51, 1, CV_32FC2, (void*)pts_); + Size patternSize(4, 9); + + std::vector< Point2f > result; + bool res = false; + + // issue reports about hangs + EXPECT_NO_THROW(res = findCirclesGrid(candidates, patternSize, result, CALIB_CB_ASYMMETRIC_GRID, Ptr()/*blobDetector=NULL*/)); + EXPECT_FALSE(res); + + if (cvtest::debugLevel > 0) + { + std::cout << Mat(candidates) << std::endl; + std::cout << Mat(result) << std::endl; + Mat img(Size(400, 300), CV_8UC3, Scalar::all(0)); + + std::vector< Point2f > centers; + candidates.copyTo(centers); + + for (size_t i = 0; i < centers.size(); i++) + { + const Point2f& pt = centers[i]; + //printf("{ %g, %g }, \n", pt.x, pt.y); + circle(img, pt, 5, Scalar(0, 255, 0)); + } + for (size_t i = 
0; i < result.size(); i++) + { + const Point2f& pt = result[i]; + circle(img, pt, 10, Scalar(0, 0, 255)); + } + imwrite("test_18713.png", img); + if (cvtest::debugLevel >= 10) + { + imshow("result", img); + waitKey(); + } + } +} + +TEST(Calib3d_AsymmetricCirclesPatternDetector, regression_19498) +{ + float pts_[121][2] = { + { 84.7462f, 404.504f }, { 49.1586f, 404.092f }, { 12.3362f, 403.434f }, { 102.542f, 386.214f }, { 67.6042f, 385.475f }, + { 31.4982f, 384.569f }, { 141.231f, 377.856f }, { 332.834f, 370.745f }, { 85.7663f, 367.261f }, { 50.346f, 366.051f }, + { 13.7726f, 364.663f }, { 371.746f, 362.011f }, { 68.8543f, 347.883f }, { 32.9334f, 346.263f }, { 331.926f, 343.291f }, + { 351.535f, 338.112f }, { 51.7951f, 328.247f }, { 15.4613f, 326.095f }, { 311.719f, 319.578f }, { 330.947f, 313.708f }, + { 256.706f, 307.584f }, { 34.6834f, 308.167f }, { 291.085f, 295.429f }, { 17.4316f, 287.824f }, { 252.928f, 277.92f }, + { 270.19f, 270.93f }, { 288.473f, 263.484f }, { 216.401f, 260.94f }, { 232.195f, 253.656f }, { 266.757f, 237.708f }, + { 211.323f, 229.005f }, { 227.592f, 220.498f }, { 154.749f, 188.52f }, { 222.52f, 184.906f }, { 133.85f, 163.968f }, + { 200.024f, 158.05f }, { 147.485f, 153.643f }, { 161.967f, 142.633f }, { 177.396f, 131.059f }, { 125.909f, 128.116f }, + { 139.817f, 116.333f }, { 91.8639f, 114.454f }, { 104.343f, 102.542f }, { 117.635f, 89.9116f }, { 70.9465f, 89.4619f }, + { 82.8524f, 76.7862f }, { 131.738f, 76.4741f }, { 95.5012f, 63.3351f }, { 109.034f, 49.0424f }, { 314.886f, 374.711f }, + { 351.735f, 366.489f }, { 279.113f, 357.05f }, { 313.371f, 348.131f }, { 260.123f, 335.271f }, { 276.346f, 330.325f }, + { 293.588f, 325.133f }, { 240.86f, 313.143f }, { 273.436f, 301.667f }, { 206.762f, 296.574f }, { 309.877f, 288.796f }, + { 187.46f, 274.319f }, { 201.521f, 267.804f }, { 248.973f, 245.918f }, { 181.644f, 244.655f }, { 196.025f, 237.045f }, + { 148.41f, 229.131f }, { 161.604f, 221.215f }, { 175.455f, 212.873f }, { 244.748f, 211.459f }, { 128.661f, 206.109f }, + { 190.217f, 204.108f }, { 141.346f, 197.568f }, { 205.876f, 194.781f }, { 168.937f, 178.948f }, { 121.006f, 173.714f }, + { 183.998f, 168.806f }, { 88.9095f, 159.731f }, { 100.559f, 149.867f }, { 58.553f, 146.47f }, { 112.849f, 139.302f }, + { 80.0968f, 125.74f }, { 39.24f, 123.671f }, { 154.582f, 103.85f }, { 59.7699f, 101.49f }, { 266.334f, 385.387f }, + { 234.053f, 368.718f }, { 263.347f, 361.184f }, { 244.763f, 339.958f }, { 198.16f, 328.214f }, { 211.675f, 323.407f }, + { 225.905f, 318.426f }, { 192.98f, 302.119f }, { 221.267f, 290.693f }, { 161.437f, 286.46f }, { 236.656f, 284.476f }, + { 168.023f, 251.799f }, { 105.385f, 221.988f }, { 116.724f, 214.25f }, { 97.2959f, 191.81f }, { 108.89f, 183.05f }, + { 77.9896f, 169.242f }, { 48.6763f, 156.088f }, { 68.9635f, 136.415f }, { 29.8484f, 133.886f }, { 49.1966f, 112.826f }, + { 113.059f, 29.003f }, { 251.698f, 388.562f }, { 281.689f, 381.929f }, { 297.875f, 378.518f }, { 248.376f, 365.025f }, + { 295.791f, 352.763f }, { 216.176f, 348.586f }, { 230.143f, 344.443f }, { 179.89f, 307.457f }, { 174.083f, 280.51f }, + { 142.867f, 265.085f }, { 155.127f, 258.692f }, { 124.187f, 243.661f }, { 136.01f, 236.553f }, { 86.4651f, 200.13f }, + { 67.5711f, 178.221f } + }; + + Mat candidates(121, 1, CV_32FC2, (void*)pts_); + Size patternSize(13, 8); + + std::vector< Point2f > result; + bool res = false; + + EXPECT_NO_THROW(res = findCirclesGrid(candidates, patternSize, result, CALIB_CB_SYMMETRIC_GRID, Ptr()/*blobDetector=NULL*/)); + EXPECT_FALSE(res); +} + }} // 
namespace /* End of file. */ diff --git a/modules/calib/test/test_cornerssubpix.cpp b/modules/calib/test/test_cornerssubpix.cpp index 05b75c5cbc..b70cc1e988 100644 --- a/modules/calib/test/test_cornerssubpix.cpp +++ b/modules/calib/test/test_cornerssubpix.cpp @@ -153,9 +153,8 @@ void CV_ChessboardSubpixelTest::run( int ) vector<Point2f> test_corners; bool result = findChessboardCorners(chessboard_image, pattern_size, test_corners, 15); - if(!result) + if (!result && cvtest::debugLevel > 0) { -#if 0 ts->printf(cvtest::TS::LOG, "Warning: chessboard was not detected! Writing image to test.png\n"); ts->printf(cvtest::TS::LOG, "Size = %d, %d\n", pattern_size.width, pattern_size.height); ts->printf(cvtest::TS::LOG, "Intrinsic params: fx = %f, fy = %f, cx = %f, cy = %f\n", @@ -167,7 +166,9 @@ void CV_ChessboardSubpixelTest::run( int ) distortion_coeffs_.at<double>(0, 4)); imwrite("test.png", chessboard_image); -#endif + } + if (!result) + { continue; } diff --git a/modules/calib/test/test_fisheye.cpp b/modules/calib/test/test_fisheye.cpp index 5acc5cafaa..310804d233 100644 --- a/modules/calib/test/test_fisheye.cpp +++ b/modules/calib/test/test_fisheye.cpp @@ -345,7 +345,7 @@ TEST_F(fisheyeTest, Calibration) std::vector<std::vector<cv::Point2d> > imagePoints(n_images); std::vector<std::vector<cv::Point3d> > objectPoints(n_images); - const std::string folder =combine(datasets_repository_path, "calib-3_stereo_from_JY"); + const std::string folder = combine(datasets_repository_path, "calib-3_stereo_from_JY"); cv::FileStorage fs_left(combine(folder, "left.xml"), cv::FileStorage::READ); CV_Assert(fs_left.isOpened()); for(int i = 0; i < n_images; ++i) @@ -373,6 +373,53 @@ TEST_F(fisheyeTest, Calibration) EXPECT_MAT_NEAR(theD, this->D, 1e-10); } +TEST_F(fisheyeTest, CalibrationWithFixedFocalLength) +{ + const int n_images = 34; + + std::vector<std::vector<cv::Point2d> > imagePoints(n_images); + std::vector<std::vector<cv::Point3d> > objectPoints(n_images); + + const std::string folder =combine(datasets_repository_path, "calib-3_stereo_from_JY"); + cv::FileStorage fs_left(combine(folder, "left.xml"), cv::FileStorage::READ); + CV_Assert(fs_left.isOpened()); + for(int i = 0; i < n_images; ++i) + fs_left[cv::format("image_%d", i )] >> imagePoints[i]; + fs_left.release(); + + cv::FileStorage fs_object(combine(folder, "object.xml"), cv::FileStorage::READ); + CV_Assert(fs_object.isOpened()); + for(int i = 0; i < n_images; ++i) + fs_object[cv::format("image_%d", i )] >> objectPoints[i]; + fs_object.release(); + + int flag = 0; + flag |= cv::fisheye::CALIB_RECOMPUTE_EXTRINSIC; + flag |= cv::fisheye::CALIB_CHECK_COND; + flag |= cv::fisheye::CALIB_FIX_SKEW; + flag |= cv::fisheye::CALIB_FIX_FOCAL_LENGTH; + flag |= cv::fisheye::CALIB_USE_INTRINSIC_GUESS; + + cv::Matx33d theK = this->K; + const cv::Matx33d newK( + 558.478088, 0.000000, 620.458461, + 0.000000, 560.506767, 381.939362, + 0.000000, 0.000000, 1.000000); + + cv::Vec4d theD; + const cv::Vec4d newD(-0.001461, -0.003298, 0.006057, -0.003742); + + cv::fisheye::calibrate(objectPoints, imagePoints, imageSize, theK, theD, + cv::noArray(), cv::noArray(), flag, cv::TermCriteria(3, 20, 1e-6)); + + // ensure that CALIB_FIX_FOCAL_LENGTH works and focal length has not changed + EXPECT_EQ(theK(0,0), K(0,0)); + EXPECT_EQ(theK(1,1), K(1,1)); + + EXPECT_MAT_NEAR(theK, newK, 1e-6); + EXPECT_MAT_NEAR(theD, newD, 1e-6); +} + TEST_F(fisheyeTest, Homography) { const int n_images = 1; @@ -380,7 +427,7 @@ TEST_F(fisheyeTest, Homography) std::vector<std::vector<cv::Point2d> > imagePoints(n_images); std::vector<std::vector<cv::Point3d> > objectPoints(n_images); - const std::string folder =combine(datasets_repository_path,
"calib-3_stereo_from_JY"); + const std::string folder = combine(datasets_repository_path, "calib-3_stereo_from_JY"); cv::FileStorage fs_left(combine(folder, "left.xml"), cv::FileStorage::READ); CV_Assert(fs_left.isOpened()); for(int i = 0; i < n_images; ++i) @@ -492,7 +539,13 @@ TEST_F(fisheyeTest, EstimateUncertainties) TEST_F(fisheyeTest, stereoRectify) { - const std::string folder =combine(datasets_repository_path, "calib-3_stereo_from_JY"); + // For consistency purposes + CV_StaticAssert( + static_cast(cv::CALIB_ZERO_DISPARITY) == static_cast(cv::fisheye::CALIB_ZERO_DISPARITY), + "For the purpose of continuity the following should be true: cv::CALIB_ZERO_DISPARITY == cv::fisheye::CALIB_ZERO_DISPARITY" + ); + + const std::string folder = combine(datasets_repository_path, "calib-3_stereo_from_JY"); cv::Size calibration_size = this->imageSize, requested_size = calibration_size; cv::Matx33d K1 = this->K, K2 = K1; @@ -504,7 +557,7 @@ TEST_F(fisheyeTest, stereoRectify) double balance = 0.0, fov_scale = 1.1; cv::Mat R1, R2, P1, P2, Q; cv::fisheye::stereoRectify(K1, D1, K2, D2, calibration_size, theR, theT, R1, R2, P1, P2, Q, - cv::CALIB_ZERO_DISPARITY, requested_size, balance, fov_scale); + cv::fisheye::CALIB_ZERO_DISPARITY, requested_size, balance, fov_scale); // Collected with these CMake flags: -DWITH_IPP=OFF -DCV_ENABLE_INTRINSICS=OFF -DCV_DISABLE_OPTIMIZATION=ON -DCMAKE_BUILD_TYPE=Debug cv::Matx33d R1_ref( @@ -551,7 +604,10 @@ TEST_F(fisheyeTest, stereoRectify) << "Q =" << std::endl << Q << std::endl; } -#if 1 // Debug code + if (cvtest::debugLevel == 0) + return; + // DEBUG code is below + cv::Mat lmapx, lmapy, rmapx, rmapy; //rewrite for fisheye cv::fisheye::initUndistortRectifyMap(K1, D1, R1, P1, requested_size, CV_32F, lmapx, lmapy); @@ -584,14 +640,13 @@ TEST_F(fisheyeTest, stereoRectify) cv::imwrite(cv::format("fisheye_rectification_AB_%03d.png", i), rectification); } -#endif } TEST_F(fisheyeTest, stereoCalibrate) { const int n_images = 34; - const std::string folder =combine(datasets_repository_path, "calib-3_stereo_from_JY"); + const std::string folder = combine(datasets_repository_path, "calib-3_stereo_from_JY"); std::vector > leftPoints(n_images); std::vector > rightPoints(n_images); @@ -658,7 +713,7 @@ TEST_F(fisheyeTest, stereoCalibrateFixIntrinsic) { const int n_images = 34; - const std::string folder =combine(datasets_repository_path, "calib-3_stereo_from_JY"); + const std::string folder = combine(datasets_repository_path, "calib-3_stereo_from_JY"); std::vector > leftPoints(n_images); std::vector > rightPoints(n_images); @@ -814,6 +869,7 @@ const cv::Matx33d fisheyeTest::K(558.478087865323, 0, 620.45851536 const cv::Vec4d fisheyeTest::D(-0.0014613319981768, -0.00329861110580401, 0.00605760088590183, -0.00374209380722371); + const cv::Matx33d fisheyeTest::R ( 9.9756700084424932e-01, 6.9698277640183867e-02, 1.4929569991321144e-03, -6.9711825162322980e-02, 9.9748249845531767e-01, 1.2997180766418455e-02, -5.8331736398316541e-04,-1.3069635393884985e-02, 9.9991441852366736e-01); diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 73e7f1d7bb..b2797ab31f 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -17,6 +17,18 @@ ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX) ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX) + +set(PARALLEL_ENABLE_PLUGINS_DEFAULT ON) +if(EMSCRIPTEN OR IOS OR WINRT) + 
set(PARALLEL_ENABLE_PLUGINS_DEFAULT OFF) +endif() +# parallel backends configuration +set(PARALLEL_ENABLE_PLUGINS "${PARALLEL_ENABLE_PLUGINS_DEFAULT}" CACHE BOOL "Allow building parallel plugin support") +# TODO building plugins with OpenCV is not supported yet +#set(PARALLEL_PLUGIN_LIST "" CACHE STRING "List of parallel backends to be compiled as plugins (tbb, openmp or special value 'all')") +#string(REPLACE "," ";" PARALLEL_PLUGIN_LIST "${PARALLEL_PLUGIN_LIST}") # support comma-separated list (,) too + + ocv_add_module(core OPTIONAL opencv_cudev WRAP java objc python js) @@ -58,10 +70,15 @@ file(GLOB_RECURSE module_opencl_hdrs source_group("Include\\Cuda Headers" FILES ${lib_cuda_hdrs}) source_group("Include\\Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail}) +file(GLOB_RECURSE core_parallel_hdrs + "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/parallel/*.hpp" + "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/parallel/*.h") +ocv_source_group("Include" DIRBASE "${CMAKE_CURRENT_LIST_DIR}/include" FILES ${core_parallel_hdrs}) + source_group("Src" FILES "${OPENCV_MODULE_opencv_core_BINARY_DIR}/version_string.inc") ocv_glob_module_sources(SOURCES "${OPENCV_MODULE_opencv_core_BINARY_DIR}/version_string.inc" - HEADERS ${module_opencl_hdrs} ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) + HEADERS ${core_parallel_hdrs} ${module_opencl_hdrs} ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) ocv_module_include_directories(${the_module} ${ZLIB_INCLUDE_DIRS} ${OPENCL_INCLUDE_DIRS}) if(ANDROID AND HAVE_CPUFEATURES) @@ -80,24 +97,46 @@ endif() if(HAVE_MEMALIGN) ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/alloc.cpp "HAVE_MEMALIGN=1") endif() +if(HAVE_WIN32_ALIGNED_MALLOC) + ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/alloc.cpp "HAVE_WIN32_ALIGNED_MALLOC=1") +endif() if(HAVE_VA_INTEL_OLD_HEADER) ocv_append_source_file_compile_definitions("${CMAKE_CURRENT_LIST_DIR}/src/va_intel.cpp" "HAVE_VA_INTEL_OLD_HEADER") endif() +if(OPENCV_LIBVA_LINK) + ocv_append_source_file_compile_definitions("${CMAKE_CURRENT_LIST_DIR}/src/va_intel.cpp" "OPENCV_LIBVA_LINK=1") +endif() option(OPENCV_ENABLE_ALLOCATOR_STATS "Enable Allocator metrics" ON) if(NOT OPENCV_ENABLE_ALLOCATOR_STATS) add_definitions(-DOPENCV_DISABLE_ALLOCATOR_STATS=1) -else() +elseif(HAVE_CXX11 OR DEFINED OPENCV_ALLOCATOR_STATS_COUNTER_TYPE) if(NOT DEFINED OPENCV_ALLOCATOR_STATS_COUNTER_TYPE) if(HAVE_ATOMIC_LONG_LONG AND OPENCV_ENABLE_ATOMIC_LONG_LONG) - set(OPENCV_ALLOCATOR_STATS_COUNTER_TYPE "long long") + if(MINGW) + # command-line generation issue due to space in value, int/int64_t should be used instead + # https://github.com/opencv/opencv/issues/16990 + message(STATUS "Consider adding OPENCV_ALLOCATOR_STATS_COUNTER_TYPE=int/int64_t according to your build configuration") + else() + set(OPENCV_ALLOCATOR_STATS_COUNTER_TYPE "long long") + endif() else() set(OPENCV_ALLOCATOR_STATS_COUNTER_TYPE "int") endif() endif() - message(STATUS "Allocator metrics storage type: '${OPENCV_ALLOCATOR_STATS_COUNTER_TYPE}'") - add_definitions("-DOPENCV_ALLOCATOR_STATS_COUNTER_TYPE=${OPENCV_ALLOCATOR_STATS_COUNTER_TYPE}") + if(DEFINED OPENCV_ALLOCATOR_STATS_COUNTER_TYPE) + message(STATUS "Allocator metrics storage type: '${OPENCV_ALLOCATOR_STATS_COUNTER_TYPE}'") + add_definitions("-DOPENCV_ALLOCATOR_STATS_COUNTER_TYPE=${OPENCV_ALLOCATOR_STATS_COUNTER_TYPE}") + endif() +endif() + + +if(PARALLEL_ENABLE_PLUGINS) + ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/parallel/parallel.cpp 
"PARALLEL_ENABLE_PLUGINS=1") + if(OPENCV_DEBUG_POSTFIX) + ocv_append_source_file_compile_definitions("${CMAKE_CURRENT_LIST_DIR}/src/parallel/parallel.cpp" "DEBUG_POSTFIX=${OPENCV_DEBUG_POSTFIX}") + endif() endif() @@ -110,6 +149,10 @@ ocv_target_link_libraries(${the_module} PRIVATE "${OPENCV_HAL_LINKER_LIBS}" ) +if(OPENCV_CORE_EXCLUDE_C_API) + ocv_target_compile_definitions(${the_module} PRIVATE "OPENCV_EXCLUDE_C_API=1") +endif() + if(HAVE_HPX) ocv_target_link_libraries(${the_module} LINK_PRIVATE "${HPX_LIBRARIES}") endif() diff --git a/modules/core/cmake/parallel/detect_openmp.cmake b/modules/core/cmake/parallel/detect_openmp.cmake new file mode 100644 index 0000000000..39c050c78d --- /dev/null +++ b/modules/core/cmake/parallel/detect_openmp.cmake @@ -0,0 +1,13 @@ +if(CMAKE_VERSION VERSION_LESS "3.9") + message(STATUS "OpenMP detection requires CMake 3.9+") # OpenMP::OpenMP_CXX target +endif() + +find_package(OpenMP) +if(OpenMP_FOUND) + if(TARGET OpenMP::OpenMP_CXX) + set(HAVE_OPENMP 1) + ocv_add_external_target(openmp "" "OpenMP::OpenMP_CXX" "HAVE_OPENMP=1") + else() + message(WARNING "OpenMP: missing OpenMP::OpenMP_CXX target") + endif() +endif() diff --git a/modules/core/cmake/parallel/detect_tbb.cmake b/modules/core/cmake/parallel/detect_tbb.cmake new file mode 100644 index 0000000000..93059f8f67 --- /dev/null +++ b/modules/core/cmake/parallel/detect_tbb.cmake @@ -0,0 +1,5 @@ +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectTBB.cmake") + +if(HAVE_TBB) + ocv_add_external_target(tbb "" "tbb" "HAVE_TBB=1") +endif() diff --git a/modules/core/cmake/parallel/init.cmake b/modules/core/cmake/parallel/init.cmake new file mode 100644 index 0000000000..c1bbe99f18 --- /dev/null +++ b/modules/core/cmake/parallel/init.cmake @@ -0,0 +1,8 @@ +macro(ocv_add_core_parallel_backend backend_id cond_var) + if(${cond_var}) + include("${CMAKE_CURRENT_LIST_DIR}/detect_${backend_id}.cmake") + endif() +endmacro() + +ocv_add_core_parallel_backend("tbb" WITH_TBB) +ocv_add_core_parallel_backend("openmp" WITH_OPENMP) diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index 50af505968..48023844a9 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -50,7 +50,6 @@ #endif #include "opencv2/core/cvdef.h" -#include "opencv2/core/version.hpp" #include "opencv2/core/base.hpp" #include "opencv2/core/cvstd.hpp" #include "opencv2/core/traits.hpp" @@ -97,6 +96,10 @@ @} @defgroup core_lowlevel_api Low-level API for external libraries / plugins @} + @defgroup core_parallel Parallel Processing + @{ + @defgroup core_parallel_backend Parallel backends API + @} @} */ diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index a3a3e51e04..21a61a4e53 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -538,6 +538,16 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n) */ CV_EXPORTS_W float cubeRoot(float val); +/** @overload + +cubeRoot with argument of `double` type calls `std::cbrt(double)` +*/ +static inline +double cubeRoot(double val) +{ + return std::cbrt(val); +} + /** @brief Calculates the angle of a 2D vector in degrees. The function fastAtan2 calculates the full-range angle of an input 2D vector. 
The angle is measured diff --git a/modules/core/include/opencv2/core/bindings_utils.hpp b/modules/core/include/opencv2/core/bindings_utils.hpp index f693dc8c65..cf8bcdd622 100644 --- a/modules/core/include/opencv2/core/bindings_utils.hpp +++ b/modules/core/include/opencv2/core/bindings_utils.hpp @@ -7,6 +7,9 @@ #include <opencv2/core/async.hpp> #include <opencv2/core/detail/async_promise.hpp> +#include <opencv2/core/utils/logger.hpp> + +#include <stdexcept> namespace cv { namespace utils { //! @addtogroup core_utils //! @{ @@ -58,6 +61,67 @@ String dumpCString(const char* argument) return cv::format("String: %s", argument); } +CV_WRAP static inline +String dumpString(const String& argument) +{ + return cv::format("String: %s", argument.c_str()); +} + +CV_WRAP static inline +String testOverloadResolution(int value, const Point& point = Point(42, 24)) +{ + return format("overload (int=%d, point=(x=%d, y=%d))", value, point.x, + point.y); +} + +CV_WRAP static inline +String testOverloadResolution(const Rect& rect) +{ + return format("overload (rect=(x=%d, y=%d, w=%d, h=%d))", rect.x, rect.y, + rect.width, rect.height); +} + +CV_WRAP static inline +String dumpRect(const Rect& argument) +{ + return format("rect: (x=%d, y=%d, w=%d, h=%d)", argument.x, argument.y, + argument.width, argument.height); +} + +CV_WRAP static inline +String dumpTermCriteria(const TermCriteria& argument) +{ + return format("term_criteria: (type=%d, max_count=%d, epsilon=%lf)", + argument.type, argument.maxCount, argument.epsilon); +} + +CV_WRAP static inline +String dumpRotatedRect(const RotatedRect& argument) +{ + return format("rotated_rect: (c_x=%f, c_y=%f, w=%f, h=%f, a=%f)", + argument.center.x, argument.center.y, argument.size.width, + argument.size.height, argument.angle); +} + +CV_WRAP static inline +String dumpRange(const Range& argument) +{ + if (argument == Range::all()) + { + return "range: all"; + } + else + { + return format("range: (s=%d, e=%d)", argument.start, argument.end); + } +} + +CV_WRAP static inline +void testRaiseGeneralException() +{ + throw std::runtime_error("exception text"); +} + CV_WRAP static inline AsyncArray testAsyncArray(InputArray argument) { @@ -81,7 +145,30 @@ AsyncArray testAsyncException() return p.getArrayResult(); } -//! @} -}} // namespace +namespace fs { + CV_EXPORTS_W cv::String getCacheDirectoryForDownloads(); +} // namespace fs + +//! @} // core_utils +} // namespace cv::utils + +//! @cond IGNORED + +CV_WRAP static inline +int setLogLevel(int level) +{ + // NB: Binding generators don't work with enums properly yet, so we define a separate overload here + return cv::utils::logging::setLogLevel((cv::utils::logging::LogLevel)level); +} + +CV_WRAP static inline +int getLogLevel() +{ + return cv::utils::logging::getLogLevel(); +} + +//! @endcond IGNORED + +} // namespaces cv / utils #endif // OPENCV_CORE_BINDINGS_UTILS_HPP diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp index 5fa09682e3..716b8bf2a8 100644 --- a/modules/core/include/opencv2/core/cuda.hpp +++ b/modules/core/include/opencv2/core/cuda.hpp @@ -340,6 +340,209 @@ public: Allocator* allocator; }; +struct CV_EXPORTS_W GpuData +{ + explicit GpuData(size_t _size); + ~GpuData(); + + GpuData(const GpuData&) = delete; + GpuData& operator=(const GpuData&) = delete; + + GpuData(GpuData&&) = delete; + GpuData& operator=(GpuData&&) = delete; + + uchar* data; + size_t size; +}; + +class CV_EXPORTS_W GpuMatND +{ +public: + using SizeArray = std::vector<int>; + using StepArray = std::vector<size_t>; + using IndexArray = std::vector<int>; + + //! destructor + ~GpuMatND(); + + //! 
default constructor + GpuMatND(); + + /** @overload + @param size Array of integers specifying an n-dimensional array shape. + @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or + CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices. + */ + GpuMatND(SizeArray size, int type); + + /** @overload + @param size Array of integers specifying an n-dimensional array shape. + @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or + CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices. + @param data Pointer to the user data. Matrix constructors that take data and step parameters do not + allocate matrix data. Instead, they just initialize the matrix header that points to the specified + data, which means that no data is copied. This operation is very efficient and can be used to + process external data using OpenCV functions. The external data is not automatically deallocated, so + you should take care of it. + @param step Array of _size.size()-1 steps in case of a multi-dimensional array (the last step is always + set to the element size). If not specified, the matrix is assumed to be continuous. + */ + GpuMatND(SizeArray size, int type, void* data, StepArray step = StepArray()); + + /** @brief Allocates GPU memory. + Suppose there is some GPU memory already allocated. In that case, this method may choose to reuse that + GPU memory under the specific condition: it must be of the same size and type, not externally allocated, + the GPU memory is continuous (i.e., isContinuous() is true), and is not a sub-matrix of another GpuMatND + (i.e., isSubmatrix() is false). In other words, this method guarantees that the GPU memory allocated by + this method is always continuous and is not a sub-region of another GpuMatND. + */ + void create(SizeArray size, int type); + + void release(); + + void swap(GpuMatND& m) noexcept; + + /** @brief Creates a full copy of the array and the underlying data. + The method creates a full copy of the array. It mimics the behavior of Mat::clone(), i.e. + the original step is not taken into account. So, the array copy is a continuous array + occupying total()\*elemSize() bytes. + */ + GpuMatND clone() const; + + /** @overload + This overload is non-blocking, so it may return even if the copy operation is not finished. + */ + GpuMatND clone(Stream& stream) const; + + /** @brief Extracts a sub-matrix. + The operator makes a new header for the specified sub-array of \*this. + The operator is an O(1) operation, that is, no matrix data is copied. + @param ranges Array of selected ranges along each dimension. + */ + GpuMatND operator()(const std::vector<Range>& ranges) const; + + /** @brief Creates a GpuMat header for a 2D plane part of an n-dim matrix. + @note The returned GpuMat is constructed with the constructor for user-allocated data. + That is, it does not perform reference counting. + @note This function does not increment this GpuMatND's reference counter. + */ + GpuMat createGpuMatHeader(IndexArray idx, Range rowRange, Range colRange) const; + + /** @overload + Creates a GpuMat header if this GpuMatND is effectively 2D. + @note The returned GpuMat is constructed with the constructor for user-allocated data. + That is, it does not perform reference counting. + @note This function does not increment this GpuMatND's reference counter. + */ + GpuMat createGpuMatHeader() const; + + /** @brief Extracts a 2D plane part of an n-dim matrix. 
+ It differs from createGpuMatHeader(IndexArray, Range, Range) in that it clones a part of this + GpuMatND to the returned GpuMat. + @note This operator does not increment this GpuMatND's reference counter. + */ + GpuMat operator()(IndexArray idx, Range rowRange, Range colRange) const; + + /** @brief Extracts a 2D plane part of an n-dim matrix if this GpuMatND is effectively 2D. + It differs from createGpuMatHeader() in that it clones a part of this GpuMatND. + @note This operator does not increment this GpuMatND's reference counter. + */ + operator GpuMat() const; + + GpuMatND(const GpuMatND&) = default; + GpuMatND& operator=(const GpuMatND&) = default; + +#if defined(__GNUC__) && __GNUC__ < 5 + // error: function '...' defaulted on its first declaration with an exception-specification + // that differs from the implicit declaration '...' + + GpuMatND(GpuMatND&&) = default; + GpuMatND& operator=(GpuMatND&&) = default; +#else + GpuMatND(GpuMatND&&) noexcept = default; + GpuMatND& operator=(GpuMatND&&) noexcept = default; +#endif + + void upload(InputArray src); + void upload(InputArray src, Stream& stream); + void download(OutputArray dst) const; + void download(OutputArray dst, Stream& stream) const; + + //! returns true iff the GpuMatND data is continuous + //! (i.e. when there are no gaps between successive rows) + bool isContinuous() const; + + //! returns true if the matrix is a sub-matrix of another matrix + bool isSubmatrix() const; + + //! returns element size in bytes + size_t elemSize() const; + + //! returns the size of element channel in bytes + size_t elemSize1() const; + + //! returns true if data is null + bool empty() const; + + //! returns true if not empty and points to external (user-allocated) gpu memory + bool external() const; + + //! returns pointer to the first byte of the GPU memory + uchar* getDevicePtr() const; + + //! returns the total number of array elements + size_t total() const; + + //! returns the size of underlying memory in bytes + size_t totalMemSize() const; + + //! returns element type + int type() const; + +private: + //! internal use + void setFields(SizeArray size, int type, StepArray step = StepArray()); + +public: + /*! includes several bit-fields: + - the magic signature + - continuity flag + - depth + - number of channels + */ + int flags; + + //! matrix dimensionality + int dims; + + //! shape of this array + SizeArray size; + + /*! step values + Their semantics is identical to the semantics of step for Mat. + */ + StepArray step; + +private: + /*! internal use + If this GpuMatND holds external memory, this is empty. + */ + std::shared_ptr<GpuData> data_; + + /*! internal use + If this GpuMatND manages memory with reference counting, this value is + always equal to data_->data. If this GpuMatND holds external memory, + data_ is empty and data points to the external memory. + */ + uchar* data; + + /*! internal use + If this GpuMatND is a sub-matrix of a larger matrix, this value is the + difference of the first byte between the sub-matrix and the whole matrix. + */ + size_t offset; +}; +
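+ +// Editor's note: a minimal usage sketch (illustrative only, not part of the original +// patch); it relies solely on the GpuMatND API declared above. +// cv::cuda::GpuMatND arr({2, 480, 640}, CV_32FC1); // continuous 3D GPU allocation +// cv::Mat host(std::vector<int>{2, 480, 640}, CV_32FC1, cv::Scalar::all(0)); +// arr.upload(host); // host -> device copy +// cv::cuda::GpuMat plane = arr.createGpuMatHeader({0}, cv::Range::all(), cv::Range::all()); // 2D view, no copy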
 /** @brief Creates a continuous matrix. @param rows Row count. @@ -656,6 +859,18 @@ public: //! creates a new asynchronous stream with custom allocator CV_WRAP Stream(const Ptr<GpuMat::Allocator>& allocator); + /** @brief creates a new Stream using the cudaFlags argument to determine the behaviors of the stream + + @note The cudaFlags parameter is passed to the underlying api cudaStreamCreateWithFlags() and + supports the same parameter values. + @code + // creates an OpenCV cuda::Stream that manages an asynchronous, non-blocking, + // non-default CUDA stream + cv::cuda::Stream cvStream(cudaStreamNonBlocking); + @endcode + */ + CV_WRAP Stream(const size_t cudaFlags); + /** @brief Returns true if the current stream queue is finished. Otherwise, it returns false. */ CV_WRAP bool queryIfComplete() const; diff --git a/modules/core/include/opencv2/core/cuda.inl.hpp b/modules/core/include/opencv2/core/cuda.inl.hpp index 30fc0aee22..3f2a0c7240 100644 --- a/modules/core/include/opencv2/core/cuda.inl.hpp +++ b/modules/core/include/opencv2/core/cuda.inl.hpp @@ -383,6 +383,92 @@ void swap(GpuMat& a, GpuMat& b) a.swap(b); } +//=================================================================================== +// GpuMatND +//=================================================================================== + +inline +GpuMatND::GpuMatND() : + flags(0), dims(0), data(nullptr), offset(0) +{ +} + +inline +GpuMatND::GpuMatND(SizeArray _size, int _type) : + flags(0), dims(0), data(nullptr), offset(0) +{ + create(std::move(_size), _type); +} + +inline +void GpuMatND::swap(GpuMatND& m) noexcept +{ + std::swap(*this, m); +} + +inline +bool GpuMatND::isContinuous() const +{ + return (flags & Mat::CONTINUOUS_FLAG) != 0; +} + +inline +bool GpuMatND::isSubmatrix() const +{ + return (flags & Mat::SUBMATRIX_FLAG) != 0; +} + +inline +size_t GpuMatND::elemSize() const +{ + return CV_ELEM_SIZE(flags); +} + +inline +size_t GpuMatND::elemSize1() const +{ + return CV_ELEM_SIZE1(flags); +} + +inline +bool GpuMatND::empty() const +{ + return data == nullptr; +} + +inline +bool GpuMatND::external() const +{ + return !empty() && data_.use_count() == 0; +} + +inline +uchar* GpuMatND::getDevicePtr() const +{ + return data + offset; +} + +inline +size_t GpuMatND::total() const +{ + size_t p = 1; + for(auto s : size) + p *= s; + return p; +} + +inline +size_t GpuMatND::totalMemSize() const +{ + return size[0] * step[0]; +} + +inline +int GpuMatND::type() const +{ + return CV_MAT_TYPE(flags); +} + //=================================================================================== // HostMem //=================================================================================== diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index ef2b31ac18..fe15e51e4e 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -170,6 +170,7 @@ #if defined CV_CPU_COMPILE_RVV # define CV_RVV 1 +# include <riscv_vector.h> #endif #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__ diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 6af58b6205..6a55995fc9 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -45,6 +45,8 @@ #ifndef OPENCV_CORE_CVDEF_H #define OPENCV_CORE_CVDEF_H +#include "opencv2/core/version.hpp" + //! @addtogroup core_utils //! @{ @@ -388,7 +390,9 @@ typedef union Cv64suf } Cv64suf; +#ifndef OPENCV_ABI_COMPATIBILITY #define OPENCV_ABI_COMPATIBILITY 400 +#endif #ifdef __OPENCV_BUILD # define DISABLE_OPENCV_3_COMPATIBILITY diff --git a/modules/core/include/opencv2/core/dualquaternion.hpp b/modules/core/include/opencv2/core/dualquaternion.hpp new file mode 100644 index 0000000000..1f644e9dc8 --- /dev/null +++ b/modules/core/include/opencv2/core/dualquaternion.hpp @@ -0,0 +1,979 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved. +// Third party copyrights are property of their respective owners. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: Liangqian Kong +// Longbu Wang +#ifndef OPENCV_CORE_DUALQUATERNION_HPP +#define OPENCV_CORE_DUALQUATERNION_HPP + +#include <opencv2/core/quaternion.hpp> +#include <opencv2/core/affine.hpp> + +namespace cv{ +//! @addtogroup core +//! @{ + +template <typename _Tp> class DualQuat; +template <typename _Tp> std::ostream& operator<<(std::ostream&, const DualQuat<_Tp>&); + +/** + * Dual quaternions were introduced to describe rotation together with translation while ordinary + * quaternions can only describe rotation. They can be used for shortest path pose interpolation, + * local pose optimization or volumetric deformation. More details can be found + * - https://en.wikipedia.org/wiki/Dual_quaternion + * - ["A beginners guide to dual-quaternions: what they are, how they work, and how to use them for 3D character hierarchies", Ben Kenwright, 2012](https://borodust.org/public/shared/beginner_dual_quats.pdf) + * - ["Dual Quaternions", Yan-Bin Jia, 2013](http://web.cs.iastate.edu/~cs577/handouts/dual-quaternion.pdf) + * - ["Geometric Skinning with Approximate Dual Quaternion Blending", Kavan, 2008](https://www.cs.utah.edu/~ladislav/kavan08geometric/kavan08geometric) + * - http://rodolphe-vaillant.fr/?e=29 + * + * A unit dual quaternion can be classically represented as: + * \f[ + * \begin{equation} + * \begin{split} + * \sigma &= \left(r+\frac{\epsilon}{2}tr\right)\\ + * &= [w, x, y, z, w\_, x\_, y\_, z\_] + * \end{split} + * \end{equation} + * \f] + * where \f$r, t\f$ represent the rotation (ordinary unit quaternion) and translation (pure ordinary quaternion) respectively. + * + * A general dual quaternion, which consists of two quaternions, is usually represented in the form: + * \f[ + * \sigma = p + \epsilon q + * \f] + * where the introduced dual unit \f$\epsilon\f$ satisfies \f$\epsilon^2 = \epsilon^3 =...=0\f$, and \f$p, q\f$ are quaternions. + * + * Alternatively, dual quaternions can also be interpreted as four components which are all [dual numbers](https://www.cs.utah.edu/~ladislav/kavan08geometric/kavan08geometric): + * \f[ + * \sigma = \hat{q}_w + \hat{q}_xi + \hat{q}_yj + \hat{q}_zk + * \f] + * If we set \f$\hat{q}_x, \hat{q}_y\f$ and \f$\hat{q}_z\f$ equal to 0, a dual quaternion is transformed to a dual number; see normalize(). + * + * If you want to create a dual quaternion, you can use: + * + * ``` + * using namespace cv; + * double angle = CV_PI; + * + * // create from eight numbers + * DualQuatd dq1(1, 2, 3, 4, 5, 6, 7, 8); //p = [1,2,3,4]. 
q=[5,6,7,8] + * + * // create from Vec + * Vec<double, 8> v{1,2,3,4,5,6,7,8}; + * DualQuatd dq_v{v}; + * + * // create from two quaternions + * Quatd p(1, 2, 3, 4); + * Quatd q(5, 6, 7, 8); + * DualQuatd dq2 = DualQuatd::createFromQuat(p, q); + * + * // create from an angle, an axis and a translation + * Vec3d axis{0, 0, 1}; + * Vec3d trans{3, 4, 5}; + * DualQuatd dq3 = DualQuatd::createFromAngleAxisTrans(angle, axis, trans); + * + * // If you already have an instance of class Affine3, then you can use + * Affine3d R = dq3.toAffine3(); + * DualQuatd dq4 = DualQuatd::createFromAffine3(R); + * + * // or create directly by affine transformation matrix Rt + * // see createFromMat() in detail for the form of Rt + * Matx44d Rt = dq3.toMat(); + * DualQuatd dq5 = DualQuatd::createFromMat(Rt); + * + * // Any rotation + translation movement can + * // be expressed as a rotation + translation around the same line in space (expressed by Plucker + * // coords), and here's a way to represent it this way. + * Vec3d axis{1, 1, 1}; // axis will be normalized in createFromPitch + * Vec3d trans{3, 4, 5}; + * axis = axis / std::sqrt(axis.dot(axis)); // The formula for computing moment that I use below requires a normalized axis + * Vec3d moment = 1.0 / 2 * (trans.cross(axis) + axis.cross(trans.cross(axis)) * + * std::cos(angle / 2) / std::sin(angle / 2)); + * double d = trans.dot(axis); + * DualQuatd dq6 = DualQuatd::createFromPitch(angle, d, axis, moment); + * ``` + * + * A point \f$v=(x, y, z)\f$ in form of dual quaternion is \f$[1+\epsilon v]=[1,0,0,0,0,x,y,z]\f$. + * The transformation of a point \f$v_1\f$ to another point \f$v_2\f$ under the dual quaternion \f$\sigma\f$ is + * \f[ + * 1 + \epsilon v_2 = \sigma * (1 + \epsilon v_1) * \sigma^{\star} + * \f] + * where \f$\sigma^{\star}=p^*-\epsilon q^*.\f$ + * + * A line in the \f$Pl\ddot{u}cker\f$ coordinates \f$(\hat{l}, m)\f$ is defined by the dual quaternion \f$l=\hat{l}+\epsilon m\f$. + * To transform a line, \f[l_2 = \sigma * l_1 * \sigma^*,\f] where \f$\sigma=r+\frac{\epsilon}{2}rt\f$ and + * \f$\sigma^*=p^*+\epsilon q^*\f$. + * + * To extract the Vec<double, 8> or Vec<float, 8>, see toVec(); + * + * To extract the affine transformation matrix, see toMat(); + * + * To extract the instance of Affine3, see toAffine3(); + * + * If two quaternions \f$q_0, q_1\f$ need to be interpolated, you can use sclerp() + * ``` + * DualQuatd::sclerp(q0, q1, t) + * ``` + * or dqblend(). + * ``` + * DualQuatd::dqblend(q0, q1, t) + * ``` + * With more than two dual quaternions to be blended, you can use generalized linear dual quaternion blending + * with the corresponding weights, i.e. gdqblend(). + * + */ +template <typename _Tp> +class CV_EXPORTS DualQuat{ + static_assert(std::is_floating_point<_Tp>::value, "Dual quaternion only makes sense with type of float or double"); + using value_type = _Tp; + +public: + static constexpr _Tp CV_DUAL_QUAT_EPS = (_Tp)1.e-6; + + DualQuat(); + + /** + * @brief create from eight same type numbers. + */ + DualQuat(const _Tp w, const _Tp x, const _Tp y, const _Tp z, const _Tp w_, const _Tp x_, const _Tp y_, const _Tp z_); + + /** + * @brief create from a double or float vector. + */ + DualQuat(const Vec<_Tp, 8> &q); + + _Tp w, x, y, z, w_, x_, y_, z_; + + /** + * @brief create Dual Quaternion from two same type quaternions p and q. 
+ * A Dual Quaternion \f$\sigma\f$ has the form: + * \f[\sigma = p + \epsilon q\f] + * where p and q are defined as follows: + * \f[\begin{equation} + * \begin{split} + * p &= w + x\boldsymbol{i} + y\boldsymbol{j} + z\boldsymbol{k}\\ + * q &= w\_ + x\_\boldsymbol{i} + y\_\boldsymbol{j} + z\_\boldsymbol{k}. + * \end{split} + * \end{equation} + * \f] + * The p and q are the real part and dual part respectively. + * @param realPart a quaternion, real part of dual quaternion. + * @param dualPart a quaternion, dual part of dual quaternion. + * @sa Quat + */ + static DualQuat<_Tp> createFromQuat(const Quat<_Tp> &realPart, const Quat<_Tp> &dualPart); + + /** + * @brief create a dual quaternion from a rotation angle \f$\theta\f$, a rotation axis + * \f$\boldsymbol{u}\f$ and a translation \f$\boldsymbol{t}\f$. + * It generates a dual quaternion \f$\sigma\f$ in the form of + * \f[\begin{equation} + * \begin{split} + * \sigma &= r + \frac{\epsilon}{2}\boldsymbol{t}r \\ + * &= [\cos(\frac{\theta}{2}), \boldsymbol{u}\sin(\frac{\theta}{2})] + * + \frac{\epsilon}{2}[0, \boldsymbol{t}][[\cos(\frac{\theta}{2}), + * \boldsymbol{u}\sin(\frac{\theta}{2})]]\\ + * &= \cos(\frac{\theta}{2}) + \boldsymbol{u}\sin(\frac{\theta}{2}) + * + \frac{\epsilon}{2}(-(\boldsymbol{t} \cdot \boldsymbol{u})\sin(\frac{\theta}{2}) + * + \boldsymbol{t}\cos(\frac{\theta}{2}) + \boldsymbol{u} \times \boldsymbol{t} \sin(\frac{\theta}{2})). + * \end{split} + * \end{equation}\f] + * @param angle rotation angle. + * @param axis rotation axis. + * @param translation a vector of length 3. + * @note Axis will be normalized in this function. The translation is applied + * after the rotation. Use @ref createFromQuat(r, r * t / 2) to create a dual quaternion + * whose translation is applied before rotation. + * @sa Quat + */ + static DualQuat<_Tp> createFromAngleAxisTrans(const _Tp angle, const Vec<_Tp, 3> &axis, const Vec<_Tp, 3> &translation); + + /** + * @brief create a dual quaternion from an affine transformation matrix \f$M\f$. + * A dual quaternion consists of a rotation \f$r=[a,b,c,d]\f$ and a translation \f$t=[\Delta x,\Delta y,\Delta z]\f$. The + * affine transformation matrix \f$M\f$ has the form + * \f[ + * \begin{bmatrix} + * 1-2(e_2^2 +e_3^2) &2(e_1e_2-e_0e_3) &2(e_0e_2+e_1e_3) &\Delta x\\ + * 2(e_0e_3+e_1e_2) &1-2(e_1^2+e_3^2) &2(e_2e_3-e_0e_1) &\Delta y\\ + * 2(e_1e_3-e_0e_2) &2(e_0e_1+e_2e_3) &1-2(e_1^2-e_2^2) &\Delta z\\ + * 0&0&0&1 + * \end{bmatrix} + * \f] + * If A is a matrix consisting of n points to be transformed, this could be achieved by + * \f[ + * new\_A = M * A + * \f] + * where A has the form + * \f[ + * \begin{bmatrix} + * x_0& x_1& x_2&...&x_n\\ + * y_0& y_1& y_2&...&y_n\\ + * z_0& z_1& z_2&...&z_n\\ + * 1&1&1&...&1 + * \end{bmatrix} + * \f] + * where the same subscript represents the same point. The size of A should be \f$[4,n]\f$, + * and new_A has the same size. + * @param _R 4x4 matrix that represents rotation and translation. + * @note Translation is applied after the rotation. Use createFromQuat(r, r * t / 2) to create + * a dual quaternion whose translation is applied before rotation. + */ + static DualQuat<_Tp> createFromMat(InputArray _R); + + /** + * @brief create dual quaternion from an affine matrix. 
For the definition of the affine matrix, see createFromMat(). + */ + static DualQuat<_Tp> createFromAffine3(const Affine3<_Tp> &R); + + /** + * @brief A dual quaternion is a vector in the form of + * \f[ + * \begin{equation} + * \begin{split} + * \sigma &=\boldsymbol{p} + \epsilon \boldsymbol{q}\\ + * &= \cos\hat{\frac{\theta}{2}}+\overline{\hat{l}}\sin\frac{\hat{\theta}}{2} + * \end{split} + * \end{equation} + * \f] + * where \f$\hat{\theta}\f$ is the dual angle and \f$\overline{\hat{l}}\f$ is the dual axis: + * \f[ + * \hat{\theta}=\theta + \epsilon d,\\ + * \overline{\hat{l}}= \hat{l} +\epsilon m. + * \f] + * In this representation, \f$\theta\f$ is the rotation angle, \f$(\hat{l},m)\f$ is the screw axis, and d is the translation distance along the axis. + * + * @param angle rotation angle. + * @param d translation along the rotation axis. + * @param axis rotation axis represented by quaternion with w = 0. + * @param moment the moment of line, and it should be orthogonal to axis. + * @note Translation is applied after the rotation. Use createFromQuat(r, r * t / 2) to create + * a dual quaternion whose translation is applied before rotation. + */ + static DualQuat<_Tp> createFromPitch(const _Tp angle, const _Tp d, const Vec<_Tp, 3> &axis, const Vec<_Tp, 3> &moment); + + /** + * @brief return a quaternion which represents the real part of the dual quaternion. + * The definition of real part is in createFromQuat(). + * @sa createFromQuat, getDualPart + */ + Quat<_Tp> getRealPart() const; + + /** + * @brief return a quaternion which represents the dual part of the dual quaternion. + * The definition of dual part is in createFromQuat(). + * @sa createFromQuat, getRealPart + */ + Quat<_Tp> getDualPart() const; + + /** + * @brief return the conjugate of a dual quaternion. + * \f[ + * \begin{equation} + * \begin{split} + * \sigma^* &= (p + \epsilon q)^* + * &= (p^* + \epsilon q^*) + * \end{split} + * \end{equation} + * \f] + * @param dq a dual quaternion. + */ + template <typename T> + friend DualQuat<T> conjugate(const DualQuat<T> &dq); + + /** + * @brief return the conjugate of a dual quaternion. + * \f[ + * \begin{equation} + * \begin{split} + * \sigma^* &= (p + \epsilon q)^* + * &= (p^* + \epsilon q^*) + * \end{split} + * \end{equation} + * \f] + */ + DualQuat<_Tp> conjugate() const; + + /** + * @brief return the rotation in quaternion form. + */ + Quat<_Tp> getRotation(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const; + + /** + * @brief return the translation vector. + * The rotation \f$r\f$ in this dual quaternion \f$\sigma\f$ is applied before translation \f$t\f$. + * The dual quaternion \f$\sigma\f$ is defined as + * \f[\begin{equation} + * \begin{split} + * \sigma &= p + \epsilon q \\ + * &= r + \frac{\epsilon}{2}{t}r. + * \end{split} + * \end{equation}\f] + * Thus, the translation can be obtained as follows + * \f[t = 2qp^*.\f] + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion is assumed to be a unit dual quaternion + * and this function will save some computations. + * @note This dual quaternion's translation is applied after the rotation. + */ + Vec<_Tp, 3> getTranslation(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const; +
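+ + // Editor's illustrative sketch (not part of the original patch): round-trip a pose + // through the accessors above; only the API declared in this class is used. + // DualQuatd dq = DualQuatd::createFromAngleAxisTrans(CV_PI / 2, Vec3d{0, 0, 1}, Vec3d{1, 0, 0}); + // Quatd r = dq.getRotation(QUAT_ASSUME_UNIT); // unit rotation part + // Vec3d t = dq.getTranslation(QUAT_ASSUME_UNIT); // recovers {1, 0, 0}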
+ /** + * @brief return the norm \f$||\sigma||\f$ of dual quaternion \f$\sigma = p + \epsilon q\f$. + * \f[ + * \begin{equation} + * \begin{split} + * ||\sigma|| &= \sqrt{\sigma * \sigma^*} \\ + * &= ||p|| + \epsilon \frac{p \cdot q}{||p||}. + * \end{split} + * \end{equation} + * \f] + * Generally speaking, the norm of a non-unit dual + * quaternion is a dual number. For convenience, we return it in the form of a dual quaternion, i.e. + * \f[ ||\sigma|| = [||p||, 0, 0, 0, \frac{p \cdot q}{||p||}, 0, 0, 0].\f] + * + * @note The dual number is returned as a dual quaternion. + */ + DualQuat<_Tp> norm() const; + + /** + * @brief return a normalized dual quaternion. + * A dual quaternion can be expressed as + * \f[ + * \begin{equation} + * \begin{split} + * \sigma &= p + \epsilon q\\ + * &=||\sigma||\left(r+\frac{1}{2}tr\right) + * \end{split} + * \end{equation} + * \f] + * where \f$r, t\f$ represent the rotation (ordinary quaternion) and translation (pure ordinary quaternion) respectively, + * and \f$||\sigma||\f$ is the norm of the dual quaternion (a dual number). + * A dual quaternion is unit if and only if + * \f[ + * ||p||=1, p \cdot q=0 + * \f] + * where \f$\cdot\f$ means dot product. + * The process of normalization is + * \f[ + * \sigma_{u}=\frac{\sigma}{||\sigma||} + * \f] + * Next, we simply prove that \f$\sigma_u\f$ is a unit dual quaternion: + * \f[ + * \renewcommand{\Im}{\operatorname{Im}} + * \begin{equation} + * \begin{split} + * \sigma_{u}=\frac{\sigma}{||\sigma||}&=\frac{p + \epsilon q}{||p||+\epsilon\frac{p\cdot q}{||p||}}\\ + * &=\frac{p}{||p||}+\epsilon\left(\frac{q}{||p||}-p\frac{p\cdot q}{||p||^3}\right)\\ + * &=\frac{p}{||p||}+\epsilon\frac{1}{||p||^2}\left(qp^{*}-p\cdot q\right)\frac{p}{||p||}\\ + * &=\frac{p}{||p||}+\epsilon\frac{1}{||p||^2}\Im(qp^*)\frac{p}{||p||}.\\ + * \end{split} + * \end{equation} + * \f] + * As expected, the real part is a rotation and dual part is a pure quaternion. + */ + DualQuat<_Tp> normalize() const; + + /** + * @brief if \f$\sigma = p + \epsilon q\f$ is a dual quaternion and p is not zero, + * the inverse dual quaternion is + * \f[\sigma^{-1} = \frac{\sigma^*}{||\sigma||^2}, \f] + * or equivalently, + * \f[\sigma^{-1} = p^{-1} - \epsilon p^{-1}qp^{-1}.\f] + * @param dq a dual quaternion. + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion dq is assumed to be a unit dual quaternion + * and this function will save some computations. + */ + template <typename T> + friend DualQuat<T> inv(const DualQuat<T> &dq, QuatAssumeType assumeUnit); + + /** + * @brief if \f$\sigma = p + \epsilon q\f$ is a dual quaternion and p is not zero, + * the inverse dual quaternion is + * \f[\sigma^{-1} = \frac{\sigma^*}{||\sigma||^2}, \f] + * or equivalently, + * \f[\sigma^{-1} = p^{-1} - \epsilon p^{-1}qp^{-1}.\f] + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion is assumed to be a unit dual quaternion + * and this function will save some computations. + */ + DualQuat<_Tp> inv(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const; + + /** + * @brief return the dot product of two dual quaternions. + * @param p other dual quaternion. + */ + _Tp dot(DualQuat<_Tp> p) const; +
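+ + // Editor's illustrative check (not part of the original patch): a normalized dual + // quaternion composed with its inverse yields the identity dual quaternion. + // DualQuatd q{1, 2, 3, 4, 5, 6, 7, 8}; + // DualQuatd u = q.normalize(); + // DualQuatd id = u * u.inv(QUAT_ASSUME_UNIT); // ~[1, 0, 0, 0, 0, 0, 0, 0]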
+ /** + * @brief return the value of \f$p^t\f$ where p is a dual quaternion. + * This could be calculated as: + * \f[ + * p^t = \exp(t\ln p) + * \f] + * @param dq a dual quaternion. + * @param t index of power function. + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion dq is assumed to be a unit dual quaternion + * and this function will save some computations. + */ + template <typename T> + friend DualQuat<T> power(const DualQuat<T> &dq, const T t, QuatAssumeType assumeUnit); + + /** + * @brief return the value of \f$p^t\f$ where p is a dual quaternion. + * This could be calculated as: + * \f[ + * p^t = \exp(t\ln p) + * \f] + * + * @param t index of power function. + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion is assumed to be a unit dual quaternion + * and this function will save some computations. + */ + DualQuat<_Tp> power(const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const; + + /** + * @brief return the value of \f$p^q\f$ where p and q are dual quaternions. + * This could be calculated as: + * \f[ + * p^q = \exp(q\ln p) + * \f] + * @param p a dual quaternion. + * @param q a dual quaternion. + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion p is assumed to be a unit dual quaternion + * and this function will save some computations. + */ + template <typename T> + friend DualQuat<T> power(const DualQuat<T>& p, const DualQuat<T>& q, QuatAssumeType assumeUnit); + + /** + * @brief return the value of \f$p^q\f$ where p and q are dual quaternions. + * This could be calculated as: + * \f[ + * p^q = \exp(q\ln p) + * \f] + * + * @param q a dual quaternion + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion is assumed to be a unit dual quaternion + * and this function will save some computations. + */ + DualQuat<_Tp> power(const DualQuat<_Tp>& q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const; + + /** + * @brief return the value of the exponential function + * @param dq a dual quaternion. + */ + template <typename T> + friend DualQuat<T> exp(const DualQuat<T> &dq); + + /** + * @brief return the value of the exponential function + */ + DualQuat<_Tp> exp() const; + + /** + * @brief return the value of the logarithm function + * + * @param dq a dual quaternion. + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion dq is assumed to be a unit dual quaternion + * and this function will save some computations. + */ + template <typename T> + friend DualQuat<T> log(const DualQuat<T> &dq, QuatAssumeType assumeUnit); + + /** + * @brief return the value of the logarithm function + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion is assumed to be a unit dual quaternion + * and this function will save some computations. + */ + DualQuat<_Tp> log(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const; + + /** + * @brief Transform this dual quaternion to a vector. + */ + Vec<_Tp, 8> toVec() const; + + /** + * @brief Transform this dual quaternion to an affine transformation matrix; + * for the form of the matrix, see createFromMat(). + */ + Matx<_Tp, 4, 4> toMat(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const; + + /** + * @brief Transform this dual quaternion to an instance of Affine3. + */ + Affine3<_Tp> toAffine3(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const; + + /** + * @brief The screw linear interpolation (ScLERP) extends spherical linear interpolation to dual quaternions. + * If \f$\sigma_1\f$ and \f$\sigma_2\f$ are two dual quaternions representing the initial and final poses, + * the ScLERP function can be defined as: + * \f[ + * ScLERP(t;\sigma_1,\sigma_2) = \sigma_1 * (\sigma_1^{-1} * \sigma_2)^t, t\in[0,1] + * \f] + * + * @param q1 a dual quaternion representing an initial pose. + * @param q2 a dual quaternion representing a final pose. + * @param t interpolation parameter + * @param directChange if true, it always returns the shortest path. + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion is assumed to be a unit dual quaternion + * and this function will save some computations. 
+ * + * For example + * ``` + * double angle1 = CV_PI / 2; + * Vec3d axis{0, 0, 1}; + * Vec3d t(0, 0, 3); + * DualQuatd initial = DualQuatd::createFromAngleAxisTrans(angle1, axis, t); + * double angle2 = CV_PI; + * DualQuatd final = DualQuatd::createFromAngleAxisTrans(angle2, axis, t); + * DualQuatd inter = DualQuatd::sclerp(initial, final, 0.5); + * ``` + */ + static DualQuat<_Tp> sclerp(const DualQuat<_Tp> &q1, const DualQuat<_Tp> &q2, const _Tp t, + bool directChange=true, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT); + /** + * @brief The method of Dual Quaternion linear Blending (DQB) computes a transformation between dual quaternions + * \f$q_1\f$ and \f$q_2\f$ and can be defined as: + * \f[ + * DQB(t;{\boldsymbol{q}}_1,{\boldsymbol{q}}_2)= + * \frac{(1-t){\boldsymbol{q}}_1+t{\boldsymbol{q}}_2}{||(1-t){\boldsymbol{q}}_1+t{\boldsymbol{q}}_2||}. + * \f] + * where \f$q_1\f$ and \f$q_2\f$ are unit dual quaternions representing the input transformations. + * If you want to use DQB that works for more than two rigid transformations, see @ref gdqblend. + * + * @param q1 a unit dual quaternion representing the input transformations. + * @param q2 a unit dual quaternion representing the input transformations. + * @param t parameter \f$t\in[0,1]\f$. + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion is assumed to be a unit dual quaternion + * and this function will save some computations. + * + * @sa gdqblend + */ + static DualQuat<_Tp> dqblend(const DualQuat<_Tp> &q1, const DualQuat<_Tp> &q2, const _Tp t, + QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT); +
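+ + // Editor's illustrative sketch (not part of the original patch): blending two unit + // dual quaternions with dqblend, reusing initial/final from the sclerp example above. + // DualQuatd mid = DualQuatd::dqblend(initial, final, 0.5, QUAT_ASSUME_UNIT); // equal-weight blend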
+ /** + * @brief The generalized Dual Quaternion linear Blending works for more than two rigid transformations. + * If these transformations are expressed as unit dual quaternions \f$q_1,...,q_n\f$ with convex weights + * \f$w = (w_1,...,w_n)\f$, the generalized DQB is simply + * \f[ + * gDQB(\boldsymbol{w};{\boldsymbol{q}}_1,...,{\boldsymbol{q}}_n)=\frac{w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n} + * {||w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n||}. + * \f] + * @param dualquat vector of dual quaternions + * @param weights vector of weights, the size of weights should be the same as dualquat, and the weights should + * satisfy \f$\sum_0^n w_{i} = 1\f$ and \f$w_i>0\f$. + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, these dual quaternions are assumed to be unit dual quaternions + * and this function will save some computations. + * @note the type of weights' element should be the same as the data type of dual quaternion inside the dualquat. + */ + template <int cn> + static DualQuat<_Tp> gdqblend(const Vec<DualQuat<_Tp>, cn> &dualquat, InputArray weights, + QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT); + + /** + * @brief The generalized Dual Quaternion linear Blending works for more than two rigid transformations. + * If these transformations are expressed as unit dual quaternions \f$q_1,...,q_n\f$ with convex weights + * \f$w = (w_1,...,w_n)\f$, the generalized DQB is simply + * \f[ + * gDQB(\boldsymbol{w};{\boldsymbol{q}}_1,...,{\boldsymbol{q}}_n)=\frac{w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n} + * {||w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n||}. + * \f] + * @param dualquat The dual quaternions which have 8 channels and 1 row or 1 col. + * @param weights vector of weights, the size of weights should be the same as dualquat, and the weights should + * satisfy \f$\sum_0^n w_{i} = 1\f$ and \f$w_i>0\f$. + * @param assumeUnit if @ref QUAT_ASSUME_UNIT, these dual quaternions are assumed to be unit dual quaternions + * and this function will save some computations. + * @note the type of weights' element should be the same as the data type of dual quaternion inside the dualquat. + */ + static DualQuat<_Tp> gdqblend(InputArray dualquat, InputArray weights, + QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT); + + /** + * @brief Return opposite dual quaternion \f$-p\f$ + * which satisfies \f$p + (-p) = 0.\f$ + * + * For example + * ``` + * DualQuatd q{1, 2, 3, 4, 5, 6, 7, 8}; + * std::cout << -q << std::endl; // [-1, -2, -3, -4, -5, -6, -7, -8] + * ``` + */ + DualQuat<_Tp> operator-() const; + + /** + * @brief return true if two dual quaternions p and q are nearly equal, i.e. when the absolute + * value of each \f$p_i - q_i\f$ is less than CV_DUAL_QUAT_EPS. + */ + bool operator==(const DualQuat<_Tp>&) const; + + /** + * @brief Subtraction operator of two dual quaternions p and q. + * It returns a new dual quaternion in which each element is the sum of \f$p_i\f$ and \f$-q_i\f$. + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12}; + * std::cout << p - q << std::endl; //[-4, -4, -4, -4, -4, -4, -4, -4] + * ``` + */ + DualQuat<_Tp> operator-(const DualQuat<_Tp>&) const; + + /** + * @brief Subtraction assignment operator of two dual quaternions p and q. + * It subtracts the right operand from the left operand and assigns the result to the left operand. + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12}; + * p -= q; // equivalent to p = p - q + * std::cout << p << std::endl; //[-4, -4, -4, -4, -4, -4, -4, -4] + * + * ``` + */ + DualQuat<_Tp>& operator-=(const DualQuat<_Tp>&); + + /** + * @brief Addition operator of two dual quaternions p and q. + * It returns a new dual quaternion in which each element is the sum of \f$p_i\f$ and \f$q_i\f$. + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12}; + * std::cout << p + q << std::endl; //[6, 8, 10, 12, 14, 16, 18, 20] + * ``` + */ + DualQuat<_Tp> operator+(const DualQuat<_Tp>&) const; + + /** + * @brief Addition assignment operator of two dual quaternions p and q. + * It adds the right operand to the left operand and assigns the result to the left operand. + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12}; + * p += q; // equivalent to p = p + q + * std::cout << p << std::endl; //[6, 8, 10, 12, 14, 16, 18, 20] + * + * ``` + */ + DualQuat<_Tp>& operator+=(const DualQuat<_Tp>&); + + /** + * @brief Multiplication assignment operator of two dual quaternions. + * It multiplies the right operand with the left operand and assigns the result to the left operand. + * + * Rule of dual quaternion multiplication: + * The dual quaternion can be written as an ordered pair of quaternions [A, B]. Thus + * \f[ + * \begin{equation} + * \begin{split} + * p * q &= [A, B][C, D]\\ + * &=[AC, AD + BC] + * \end{split} + * \end{equation} + * \f] + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12}; + * p *= q; + * std::cout << p << std::endl; //[-60, 12, 30, 24, -216, 80, 124, 120] + * ``` + */ + DualQuat<_Tp>& operator*=(const DualQuat<_Tp>&); +
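+ + // Editor's worked check of the pair rule above (not part of the original patch): with + // p = [A, B] and q = [C, D], the real part of p * q is A*C and the dual part is A*D + B*C. + // DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}, q{5, 6, 7, 8, 9, 10, 11, 12}; + // Quatd ac = p.getRealPart() * q.getRealPart(); // (-60, 12, 30, 24) + // Quatd adbc = p.getRealPart() * q.getDualPart() + p.getDualPart() * q.getRealPart(); // (-216, 80, 124, 120) + // p * q == DualQuatd::createFromQuat(ac, adbc) // holds by the rule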
+ /** + * @brief Multiplication assignment operator of a dual quaternion and a scalar. + * It multiplies the right operand with the left operand and assigns the result to the left operand. + * + * Rule of dual quaternion multiplication with a scalar: + * \f[ + * \begin{equation} + * \begin{split} + * p * s &= [w, x, y, z, w\_, x\_, y\_, z\_] * s\\ + * &=[w s, x s, y s, z s, w\_ \space s, x\_ \space s, y\_ \space s, z\_ \space s]. + * \end{split} + * \end{equation} + * \f] + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * double s = 2.0; + * p *= s; + * std::cout << p << std::endl; //[2, 4, 6, 8, 10, 12, 14, 16] + * ``` + * @note the type of the scalar should be equal to the element type of the dual quaternion. + */ + DualQuat<_Tp> operator*=(const _Tp s); + + + /** + * @brief Multiplication operator of two dual quaternions p and q. + * Multiplies values on either side of the operator. + * + * Rule of dual quaternion multiplication: + * The dual quaternion can be written as an ordered pair of quaternions [A, B]. Thus + * \f[ + * \begin{equation} + * \begin{split} + * p * q &= [A, B][C, D]\\ + * &=[AC, AD + BC] + * \end{split} + * \end{equation} + * \f] + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12}; + * std::cout << p * q << std::endl; //[-60, 12, 30, 24, -216, 80, 124, 120] + * ``` + */ + DualQuat<_Tp> operator*(const DualQuat<_Tp>&) const; + + /** + * @brief Division operator of a dual quaternion and a scalar. + * It divides the left operand by the right operand. + * + * Rule of dual quaternion division with a scalar: + * \f[ + * \begin{equation} + * \begin{split} + * p / s &= [w, x, y, z, w\_, x\_, y\_, z\_] / s\\ + * &=[w/s, x/s, y/s, z/s, w\_/s, x\_/s, y\_/s, z\_/s]. + * \end{split} + * \end{equation} + * \f] + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * double s = 2.0; + * std::cout << p / s << std::endl; //[0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4] + * ``` + * @note the type of the scalar should be equal to the element type of this dual quaternion. + */ + DualQuat<_Tp> operator/(const _Tp s) const; + + /** + * @brief Division operator of two dual quaternions p and q. + * Divides left hand operand by right hand operand. + * + * Rule of dual quaternion division with a dual quaternion: + * \f[ + * \begin{equation} + * \begin{split} + * p / q &= p * q.inv()\\ + * \end{split} + * \end{equation} + * \f] + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12}; + * std::cout << p / q << std::endl; // equivalent to p * q.inv() + * ``` + */ + DualQuat<_Tp> operator/(const DualQuat<_Tp>&) const; + + /** + * @brief Division assignment operator of two dual quaternions p and q. + * It divides the left operand by the right operand and assigns the result to the left operand. 
+ * + * Rule of dual quaternion division with a dual quaternion: + * \f[ + * \begin{equation} + * \begin{split} + * p / q &= p * q.inv()\\ + * \end{split} + * \end{equation} + * \f] + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12}; + * p /= q; // equivalent to p = p * q.inv() + * std::cout << p << std::endl; + * ``` + */ + DualQuat<_Tp>& operator/=(const DualQuat<_Tp>&); + + /** + * @brief Division assignment operator of a dual quaternion and a scalar. + * It divides the left operand by the right operand and assigns the result to the left operand. + * + * Rule of dual quaternion division with a scalar: + * \f[ + * \begin{equation} + * \begin{split} + * p / s &= [w, x, y, z, w\_, x\_, y\_, z\_] / s\\ + * &=[w/s, x/s, y/s, z/s, w\_/s, x\_/s, y\_/s, z\_/s]. + * \end{split} + * \end{equation} + * \f] + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * double s = 2.0; + * p /= s; // equivalent to p = p / s + * std::cout << p << std::endl; //[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0] + * ``` + * @note the type of the scalar should be equal to the element type of the dual quaternion. + */ + DualQuat<_Tp>& operator/=(const _Tp s); + + /** + * @brief Addition operator of a scalar and a dual quaternion. + * Adds the right hand operand to the left hand operand. + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * double scalar = 2.0; + * std::cout << scalar + p << std::endl; //[3.0, 2, 3, 4, 5, 6, 7, 8] + * ``` + * @note the type of the scalar should be equal to the element type of the dual quaternion. + */ + template <typename T> + friend DualQuat<T> cv::operator+(const T s, const DualQuat<T>&); + + /** + * @brief Addition operator of a dual quaternion and a scalar. + * Adds the right hand operand to the left hand operand. + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * double scalar = 2.0; + * std::cout << p + scalar << std::endl; //[3.0, 2, 3, 4, 5, 6, 7, 8] + * ``` + * @note the type of the scalar should be equal to the element type of the dual quaternion. + */ + template <typename T> + friend DualQuat<T> cv::operator+(const DualQuat<T>&, const T s); + + /** + * @brief Multiplication operator of a scalar and a dual quaternion. + * Multiplies values on either side of the operator. + * + * Rule of dual quaternion multiplication with a scalar: + * \f[ + * \begin{equation} + * \begin{split} + * p * s &= [w, x, y, z, w\_, x\_, y\_, z\_] * s\\ + * &=[w s, x s, y s, z s, w\_ \space s, x\_ \space s, y\_ \space s, z\_ \space s]. + * \end{split} + * \end{equation} + * \f] + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * double s = 2.0; + * std::cout << s * p << std::endl; //[2, 4, 6, 8, 10, 12, 14, 16] + * ``` + * @note the type of the scalar should be equal to the element type of the dual quaternion. + */ + template <typename T> + friend DualQuat<T> cv::operator*(const T s, const DualQuat<T>&); + + /** + * @brief Subtraction operator of a dual quaternion and a scalar. + * Subtracts the right hand operand from the left hand operand. + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * double scalar = 2.0; + * std::cout << p - scalar << std::endl; //[-1, 2, 3, 4, 5, 6, 7, 8] + * ``` + * @note the type of the scalar should be equal to the element type of the dual quaternion. + */ + template <typename T> + friend DualQuat<T> cv::operator-(const DualQuat<T>&, const T s); + + /** + * @brief Subtraction operator of a scalar and a dual quaternion. + * Subtracts the right hand operand from the left hand operand. + * + * For example + * ``` + * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8}; + * double scalar = 2.0; + * std::cout << scalar - p << std::endl; //[1.0, -2, -3, -4, -5, -6, -7, -8] + * ``` + * @note the type of the scalar should be equal to the element type of the dual quaternion. + */ + template <typename T> + friend DualQuat<T> cv::operator-(const T s, const DualQuat<T>&); + + /** + * @brief Multiplication operator of a dual quaternion and a scalar. + * Multiplies values on either side of the operator. 
+
+ /**
+ * @brief Multiplication operator of a dual quaternion and a scalar.
+ * It multiplies each component of the dual quaternion by the scalar.
+ *
+ * Rule of dual quaternion multiplication with a scalar:
+ * \f[
+ * \begin{equation}
+ * \begin{split}
+ * p * s &= [w, x, y, z, w\_, x\_, y\_, z\_] * s\\
+ * &=[w s, x s, y s, z s, w\_ \space s, x\_ \space s, y\_ \space s, z\_ \space s].
+ * \end{split}
+ * \end{equation}
+ * \f]
+ *
+ * For example
+ * ```
+ * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+ * double s = 2.0;
+ * std::cout << p * s << std::endl; //[2, 4, 6, 8, 10, 12, 14, 16]
+ * ```
+ * @note the scalar type should be the same as the element type of the dual quaternion.
+ */
+ template <typename T>
+ friend DualQuat<T> cv::operator*(const DualQuat<T>&, const T s);
+
+ template <typename S>
+ friend std::ostream& cv::operator<<(std::ostream&, const DualQuat<S>&);
+
+};
+
+using DualQuatd = DualQuat<double>;
+using DualQuatf = DualQuat<float>;
+
+//! @} core
+}//namespace
+
+#include "dualquaternion.inl.hpp"
+
+#endif /* OPENCV_CORE_DUALQUATERNION_HPP */
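Before the implementation file, a short sketch of the typical workflow (an editor's illustration, not part of the patch): encode a rotation plus a translation as a unit dual quaternion and read the pose back.

```cpp
// Sketch: a rigid transform as a unit dual quaternion.
#include <iostream>
#include <opencv2/core/dualquaternion.hpp>

int main()
{
    const double angle = CV_PI / 2;  // rotation of 90 degrees...
    const cv::Vec3d axis{0, 0, 1};   // ...around the z axis,
    const cv::Vec3d t{1, 2, 3};      // combined with this translation

    cv::DualQuatd dq = cv::DualQuatd::createFromAngleAxisTrans(angle, axis, t);

    // The result is unit by construction, so QUAT_ASSUME_UNIT is safe here.
    std::cout << dq.getRotation(cv::QUAT_ASSUME_UNIT) << std::endl;
    std::cout << dq.getTranslation(cv::QUAT_ASSUME_UNIT) << std::endl; // [1, 2, 3]
    std::cout << dq.toMat(cv::QUAT_ASSUME_UNIT) << std::endl;          // 4x4 homogeneous matrix
}
```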
diff --git a/modules/core/include/opencv2/core/dualquaternion.inl.hpp b/modules/core/include/opencv2/core/dualquaternion.inl.hpp
new file mode 100644
index 0000000000..4aec961dd2
--- /dev/null
+++ b/modules/core/include/opencv2/core/dualquaternion.inl.hpp
@@ -0,0 +1,487 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//
+//                       License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Liangqian Kong
+//         Longbu Wang
+
+#ifndef OPENCV_CORE_DUALQUATERNION_INL_HPP
+#define OPENCV_CORE_DUALQUATERNION_INL_HPP
+
+#ifndef OPENCV_CORE_DUALQUATERNION_HPP
+#error This is not a standalone header. Include dualquaternion.hpp instead.
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////
+//Implementation
+namespace cv {
+
+template <typename T>
+DualQuat<T>::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){};
+
+template <typename T>
+DualQuat<T>::DualQuat(const T vw, const T vx, const T vy, const T vz, const T _w, const T _x, const T _y, const T _z):
+                      w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){};
+
+template <typename T>
+DualQuat<T>::DualQuat(const Vec<T, 8> &q):w(q[0]), x(q[1]), y(q[2]), z(q[3]),
+                                          w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){};
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromQuat(const Quat<T> &realPart, const Quat<T> &dualPart)
+{
+    T w = realPart.w;
+    T x = realPart.x;
+    T y = realPart.y;
+    T z = realPart.z;
+    T w_ = dualPart.w;
+    T x_ = dualPart.x;
+    T y_ = dualPart.y;
+    T z_ = dualPart.z;
+    return DualQuat<T>(w, x, y, z, w_, x_, y_, z_);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromAngleAxisTrans(const T angle, const Vec<T, 3> &axis, const Vec<T, 3> &trans)
+{
+    Quat<T> r = Quat<T>::createFromAngleAxis(angle, axis);
+    Quat<T> t{0, trans[0], trans[1], trans[2]};
+    return createFromQuat(r, t * r / 2);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromMat(InputArray _R)
+{
+    CV_CheckTypeEQ(_R.type(), cv::traits::Type<T>::value, "");
+    if (_R.size() != Size(4, 4))
+    {
+        CV_Error(Error::StsBadArg, "The input matrix must have 4 columns and 4 rows");
+    }
+    Mat R = _R.getMat();
+    Quat<T> r = Quat<T>::createFromRotMat(R.colRange(0, 3).rowRange(0, 3));
+    Quat<T> trans(0, R.at<T>(0, 3), R.at<T>(1, 3), R.at<T>(2, 3));
+    return createFromQuat(r, trans * r / 2);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromAffine3(const Affine3<T> &R)
+{
+    return createFromMat(R.matrix);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromPitch(const T angle, const T d, const Vec<T, 3> &axis, const Vec<T, 3> &moment)
+{
+    T half_angle = angle / 2, half_d = d / 2;
+    Quat<T> qaxis = Quat<T>(0, axis[0], axis[1], axis[2]).normalize();
+    Quat<T> qmoment = Quat<T>(0, moment[0], moment[1], moment[2]);
+    qmoment -= qaxis * axis.dot(moment);
+    Quat<T> dual = -half_d * std::sin(half_angle) + std::sin(half_angle) * qmoment +
+                   half_d * std::cos(half_angle) * qaxis;
+    return createFromQuat(Quat<T>::createFromAngleAxis(angle, axis), dual);
+}
+
+template <typename T>
+inline bool DualQuat<T>::operator==(const DualQuat<T> &q) const
+{
+    return (abs(w - q.w) < CV_DUAL_QUAT_EPS && abs(x - q.x) < CV_DUAL_QUAT_EPS &&
+            abs(y - q.y) < CV_DUAL_QUAT_EPS && abs(z - q.z) < CV_DUAL_QUAT_EPS &&
+            abs(w_ - q.w_) < CV_DUAL_QUAT_EPS && abs(x_ - q.x_) < CV_DUAL_QUAT_EPS &&
+            abs(y_ - q.y_) < CV_DUAL_QUAT_EPS && abs(z_ - q.z_) < CV_DUAL_QUAT_EPS);
+}
+
+template <typename T>
+inline Quat<T> DualQuat<T>::getRealPart() const
+{
+    return Quat<T>(w, x, y, z);
+}
+
+template <typename T>
+inline Quat<T> DualQuat<T>::getDualPart() const
+{
+    return Quat<T>(w_, x_, y_, z_);
+}
+
+template <typename T>
+inline DualQuat<T> conjugate(const DualQuat<T> &dq)
+{
+    return dq.conjugate();
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::conjugate() const
+{
+    return DualQuat<T>(w, -x, -y, -z, w_, -x_, -y_, -z_);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::norm() const
+{
+    Quat<T> real = getRealPart();
+    T realNorm = real.norm();
+    Quat<T> dual = getDualPart();
+    if (realNorm < CV_DUAL_QUAT_EPS){
+        return DualQuat<T>(0, 0, 0, 0, 0, 0, 0, 0);
+    }
+    return DualQuat<T>(realNorm, 0, 0, 0, real.dot(dual) / realNorm, 0, 0, 0);
+}
+
+template <typename T>
+inline Quat<T> DualQuat<T>::getRotation(QuatAssumeType assumeUnit) const
+{
+    if (assumeUnit)
+    {
+        return getRealPart();
+    }
+    return getRealPart().normalize();
+}
+
+template <typename T>
+inline Vec<T, 3> DualQuat<T>::getTranslation(QuatAssumeType assumeUnit) const
+{
+    Quat<T> trans = 2.0 * (getDualPart() * getRealPart().inv(assumeUnit));
+    return Vec<T, 3>{trans[1], trans[2], trans[3]};
+}
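norm() above returns a dual number packed into dual-quaternion form; a sketch of what it and normalize() guarantee (an editor's illustration, not part of the patch):

```cpp
// Sketch: norm() is the dual number [||p||, p.dot(q)/||p||] of real part p and
// dual part q; normalize() makes p unit and orthogonal to q.
#include <iostream>
#include <opencv2/core/dualquaternion.hpp>

int main()
{
    cv::DualQuatd dq{1, 2, 3, 4, 5, 6, 7, 8};
    cv::Quatd p = dq.getRealPart(), q = dq.getDualPart();

    std::cout << dq.norm() << std::endl;                              // [||p||, 0, 0, 0, p.dot(q)/||p||, 0, 0, 0]
    std::cout << p.norm() << " " << p.dot(q) / p.norm() << std::endl; // the same two values

    cv::DualQuatd u = dq.normalize();
    std::cout << u.getRealPart().norm() << std::endl;               // 1
    std::cout << u.getRealPart().dot(u.getDualPart()) << std::endl; // ~0
}
```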
+
+template <typename T>
+DualQuat<T> DualQuat<T>::normalize() const
+{
+    Quat<T> p = getRealPart();
+    Quat<T> q = getDualPart();
+    T p_norm = p.norm();
+    if (p_norm < CV_DUAL_QUAT_EPS)
+    {
+        CV_Error(Error::StsBadArg, "Cannot normalize this dual quaternion: the norm is too small.");
+    }
+    Quat<T> p_nr = p / p_norm;
+    Quat<T> q_nr = q / p_norm;
+    return createFromQuat(p_nr, q_nr - p_nr * p_nr.dot(q_nr));
+}
+
+template <typename T>
+inline T DualQuat<T>::dot(DualQuat<T> q) const
+{
+    return q.w * w + q.x * x + q.y * y + q.z * z + q.w_ * w_ + q.x_ * x_ + q.y_ * y_ + q.z_ * z_;
+}
+
+template <typename T>
+inline DualQuat<T> inv(const DualQuat<T> &dq, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return dq.inv(assumeUnit);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::inv(QuatAssumeType assumeUnit) const
+{
+    Quat<T> real = getRealPart();
+    Quat<T> dual = getDualPart();
+    return createFromQuat(real.inv(assumeUnit), -real.inv(assumeUnit) * dual * real.inv(assumeUnit));
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator-(const DualQuat<T> &q) const
+{
+    return DualQuat<T>(w - q.w, x - q.x, y - q.y, z - q.z, w_ - q.w_, x_ - q.x_, y_ - q.y_, z_ - q.z_);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator-() const
+{
+    return DualQuat<T>(-w, -x, -y, -z, -w_, -x_, -y_, -z_);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator+(const DualQuat<T> &q) const
+{
+    return DualQuat<T>(w + q.w, x + q.x, y + q.y, z + q.z, w_ + q.w_, x_ + q.x_, y_ + q.y_, z_ + q.z_);
+}
+
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator+=(const DualQuat<T> &q)
+{
+    *this = *this + q;
+    return *this;
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator*(const DualQuat<T> &q) const
+{
+    Quat<T> A = getRealPart();
+    Quat<T> B = getDualPart();
+    Quat<T> C = q.getRealPart();
+    Quat<T> D = q.getDualPart();
+    return DualQuat<T>::createFromQuat(A * C, A * D + B * C);
+}
+
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator*=(const DualQuat<T> &q)
+{
+    *this = *this * q;
+    return *this;
+}
+
+template <typename T>
+inline DualQuat<T> operator+(const T a, const DualQuat<T> &q)
+{
+    return DualQuat<T>(a + q.w, q.x, q.y, q.z, q.w_, q.x_, q.y_, q.z_);
+}
+
+template <typename T>
+inline DualQuat<T> operator+(const DualQuat<T> &q, const T a)
+{
+    return DualQuat<T>(a + q.w, q.x, q.y, q.z, q.w_, q.x_, q.y_, q.z_);
+}
+
+template <typename T>
+inline DualQuat<T> operator-(const DualQuat<T> &q, const T a)
+{
+    return DualQuat<T>(q.w - a, q.x, q.y, q.z, q.w_, q.x_, q.y_, q.z_);
+}
+
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator-=(const DualQuat<T> &q)
+{
+    *this = *this - q;
+    return *this;
+}
+
+template <typename T>
+inline DualQuat<T> operator-(const T a, const DualQuat<T> &q)
+{
+    return DualQuat<T>(a - q.w, -q.x, -q.y, -q.z, -q.w_, -q.x_, -q.y_, -q.z_);
+}
+
+template <typename T>
+inline DualQuat<T> operator*(const T a, const DualQuat<T> &q)
+{
+    return DualQuat<T>(q.w * a, q.x * a, q.y * a, q.z * a, q.w_ * a, q.x_ * a, q.y_ * a, q.z_ * a);
+}
+
+template <typename T>
+inline DualQuat<T> operator*(const DualQuat<T> &q, const T a)
+{
+    return DualQuat<T>(q.w * a, q.x * a, q.y * a, q.z * a, q.w_ * a, q.x_ * a, q.y_ * a, q.z_ * a);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator/(const T a) const
+{
+    return DualQuat<T>(w / a, x / a, y / a, z / a, w_ / a, x_ / a, y_ / a, z_ / a);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator/(const DualQuat<T> &q) const
+{
+    return *this * q.inv();
+}
+
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator/=(const DualQuat<T> &q)
+{
+    *this = *this / q;
+    return *this;
+}
+
+template <typename T>
+std::ostream & operator<<(std::ostream &os, const DualQuat<T> &q)
+{
+    os << "DualQuat " << Vec<T, 8>{q.w, q.x, q.y, q.z, q.w_, q.x_, q.y_, q.z_};
+    return os;
+}
+
+template <typename T>
+inline DualQuat<T> exp(const DualQuat<T> &dq)
+{
+    return dq.exp();
+}
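The exponential and logarithm implemented around here are mutual inverses, which is exactly what power() builds on (q^t = exp(t log q)); a round-trip sketch (an editor's illustration, not part of the patch):

```cpp
// Sketch: log/exp round trip on a unit dual quaternion, and a half screw motion.
#include <iostream>
#include <opencv2/core/dualquaternion.hpp>

int main()
{
    cv::DualQuatd dq = cv::DualQuatd::createFromAngleAxisTrans(
        CV_PI / 3, cv::Vec3d{0, 0, 1}, cv::Vec3d{1, 2, 3});

    // Both lines print the same eight components.
    std::cout << dq << "\n" << dq.log(cv::QUAT_ASSUME_UNIT).exp() << std::endl;

    // Half of the rigid motion along its screw axis:
    std::cout << dq.power(0.5, cv::QUAT_ASSUME_UNIT) << std::endl;
}
```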
+
+namespace detail {
+
+template <typename _Tp>
+Matx<_Tp, 4, 4> jacob_exp(const Quat<_Tp> &q)
+{
+    _Tp nv = std::sqrt(q.x * q.x + q.y * q.y + q.z * q.z);
+    _Tp sinc_nv = abs(nv) < cv::DualQuat<_Tp>::CV_DUAL_QUAT_EPS ? 1 - nv * nv / 6 : std::sin(nv) / nv;
+    _Tp csiii_nv = abs(nv) < cv::DualQuat<_Tp>::CV_DUAL_QUAT_EPS ? -(_Tp)1.0 / 3 : (std::cos(nv) - sinc_nv) / nv / nv;
+    Matx<_Tp, 4, 4> J_exp_quat {
+        std::cos(nv), -sinc_nv * q.x, -sinc_nv * q.y, -sinc_nv * q.z,
+        sinc_nv * q.x, csiii_nv * q.x * q.x + sinc_nv, csiii_nv * q.x * q.y, csiii_nv * q.x * q.z,
+        sinc_nv * q.y, csiii_nv * q.y * q.x, csiii_nv * q.y * q.y + sinc_nv, csiii_nv * q.y * q.z,
+        sinc_nv * q.z, csiii_nv * q.z * q.x, csiii_nv * q.z * q.y, csiii_nv * q.z * q.z + sinc_nv
+    };
+    return std::exp(q.w) * J_exp_quat;
+}
+
+} // namespace detail
+
+template <typename T>
+DualQuat<T> DualQuat<T>::exp() const
+{
+    Quat<T> real = getRealPart();
+    return createFromQuat(real.exp(), Quat<T>(detail::jacob_exp(real) * getDualPart().toVec()));
+}
+
+template <typename T>
+DualQuat<T> log(const DualQuat<T> &dq, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return dq.log(assumeUnit);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::log(QuatAssumeType assumeUnit) const
+{
+    Quat<T> plog = getRealPart().log(assumeUnit);
+    Matx<T, 4, 4> jacob = detail::jacob_exp(plog);
+    return createFromQuat(plog, Quat<T>(jacob.inv() * getDualPart().toVec()));
+}
+
+template <typename T>
+inline DualQuat<T> power(const DualQuat<T> &dq, const T t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return dq.power(t, assumeUnit);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::power(const T t, QuatAssumeType assumeUnit) const
+{
+    return (t * log(assumeUnit)).exp();
+}
+
+template <typename T>
+inline DualQuat<T> power(const DualQuat<T> &p, const DualQuat<T> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return p.power(q, assumeUnit);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::power(const DualQuat<T> &q, QuatAssumeType assumeUnit) const
+{
+    return (q * log(assumeUnit)).exp();
+}
+
+template <typename T>
+inline Vec<T, 8> DualQuat<T>::toVec() const
+{
+    return Vec<T, 8>(w, x, y, z, w_, x_, y_, z_);
+}
+
+template <typename T>
+Affine3<T> DualQuat<T>::toAffine3(QuatAssumeType assumeUnit) const
+{
+    return Affine3<T>(toMat(assumeUnit));
+}
+
+template <typename T>
+Matx<T, 4, 4> DualQuat<T>::toMat(QuatAssumeType assumeUnit) const
+{
+    Matx<T, 4, 4> rot44 = getRotation(assumeUnit).toRotMat4x4();
+    Vec<T, 3> translation = getTranslation(assumeUnit);
+    rot44(0, 3) = translation[0];
+    rot44(1, 3) = translation[1];
+    rot44(2, 3) = translation[2];
+    return rot44;
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::sclerp(const DualQuat<T> &q0, const DualQuat<T> &q1, const T t, bool directChange, QuatAssumeType assumeUnit)
+{
+    DualQuat<T> v0(q0), v1(q1);
+    if (!assumeUnit)
+    {
+        v0 = v0.normalize();
+        v1 = v1.normalize();
+    }
+    Quat<T> v0Real = v0.getRealPart();
+    Quat<T> v1Real = v1.getRealPart();
+    if (directChange && v1Real.dot(v0Real) < 0)
+    {
+        v0 = -v0;
+    }
+    DualQuat<T> v0inv1 = v0.inv() * v1;
+    return v0 * v0inv1.power(t, QUAT_ASSUME_UNIT);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::dqblend(const DualQuat<T> &q1, const DualQuat<T> &q2, const T t, QuatAssumeType assumeUnit)
+{
+    DualQuat<T> v1(q1), v2(q2);
+    if (!assumeUnit)
+    {
+        v1 = v1.normalize();
+        v2 = v2.normalize();
+    }
+    if (v1.getRotation(assumeUnit).dot(v2.getRotation(assumeUnit)) < 0)
+    {
+        return ((1 - t) * v1 - t * v2).normalize();
+    }
+    return ((1 - t) * v1 + t * v2).normalize();
+}
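sclerp() follows the screw path at constant speed while dqblend() renormalizes a weighted sum; a small comparison sketch (an editor's illustration, not part of the patch):

```cpp
// Sketch: interpolating between two rigid poses.
#include <iostream>
#include <opencv2/core/dualquaternion.hpp>

int main()
{
    cv::DualQuatd q0 = cv::DualQuatd::createFromAngleAxisTrans(
        0.0, cv::Vec3d{0, 0, 1}, cv::Vec3d{0, 0, 0});
    cv::DualQuatd q1 = cv::DualQuatd::createFromAngleAxisTrans(
        CV_PI / 2, cv::Vec3d{0, 0, 1}, cv::Vec3d{4, 0, 0});

    for (double t = 0; t <= 1.0; t += 0.25) // screw linear interpolation
    {
        cv::DualQuatd m = cv::DualQuatd::sclerp(q0, q1, t, true, cv::QUAT_ASSUME_UNIT);
        std::cout << t << ": " << m.getTranslation(cv::QUAT_ASSUME_UNIT) << std::endl;
    }

    // Dual quaternion blending of the same two poses with equal weights:
    std::cout << cv::DualQuatd::dqblend(q0, q1, 0.5, cv::QUAT_ASSUME_UNIT) << std::endl;
}
```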
+
+template <typename T>
+DualQuat<T> DualQuat<T>::gdqblend(InputArray _dualquat, InputArray _weight, QuatAssumeType assumeUnit)
+{
+    CV_CheckTypeEQ(_weight.type(), cv::traits::Type<T>::value, "");
+    CV_CheckTypeEQ(_dualquat.type(), CV_MAKETYPE(CV_MAT_DEPTH(cv::traits::Type<T>::value), 8), "");
+    Size dq_s = _dualquat.size();
+    if (dq_s != _weight.size() || (dq_s.height != 1 && dq_s.width != 1))
+    {
+        CV_Error(Error::StsBadArg, "The size of weight must be the same as dualquat, both of them should be (1, n) or (n, 1)");
+    }
+    Mat dualquat = _dualquat.getMat(), weight = _weight.getMat();
+    const int cn = std::max(dq_s.width, dq_s.height);
+    if (!assumeUnit)
+    {
+        for (int i = 0; i < cn; ++i)
+        {
+            dualquat.at<Vec<T, 8>>(i) = DualQuat<T>{dualquat.at<Vec<T, 8>>(i)}.normalize().toVec();
+        }
+    }
+    Vec<T, 8> dq_blend = dualquat.at<Vec<T, 8>>(0) * weight.at<T>(0);
+    Quat<T> q0 = DualQuat<T> {dualquat.at<Vec<T, 8>>(0)}.getRotation(assumeUnit);
+    for (int i = 1; i < cn; ++i)
+    {
+        T k = q0.dot(DualQuat<T>{dualquat.at<Vec<T, 8>>(i)}.getRotation(assumeUnit)) < 0 ? -1: 1;
+        dq_blend = dq_blend + dualquat.at<Vec<T, 8>>(i) * k * weight.at<T>(i);
+    }
+    return DualQuat<T>{dq_blend}.normalize();
+}
+
+template <typename T>
+template <int cn>
+DualQuat<T> DualQuat<T>::gdqblend(const Vec<DualQuat<T>, cn> &_dualquat, InputArray _weight, QuatAssumeType assumeUnit)
+{
+    Vec<DualQuat<T>, cn> dualquat(_dualquat);
+    if (cn == 0)
+    {
+        return DualQuat<T>(1, 0, 0, 0, 0, 0, 0, 0);
+    }
+    Mat dualquat_mat(cn, 1, CV_64FC(8));
+    for (int i = 0; i < cn ; ++i)
+    {
+        dualquat_mat.at<Vec<T, 8>>(i) = dualquat[i].toVec();
+    }
+    return gdqblend(dualquat_mat, _weight, assumeUnit);
+}
+
+} //namespace cv
+
+#endif /*OPENCV_CORE_DUALQUATERNION_INL_HPP*/
diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp
index 0f53cf5c1b..eb4fbe213b 100644
--- a/modules/core/include/opencv2/core/fast_math.hpp
+++ b/modules/core/include/opencv2/core/fast_math.hpp
@@ -76,6 +76,9 @@
 #if defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 \
     && !defined(OPENCV_SKIP_INCLUDE_ALTIVEC_H)
     #include <altivec.h>
+    #undef vector
+    #undef bool
+    #undef pixel
 #endif
 
 #if defined(CV_INLINE_ROUND_FLT)
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 0ffcb49cea..6f5b8e1788 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -104,7 +104,7 @@ template<typename _Tp> struct V_TypeTraits
 {
 };
 
-#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \
+#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \
 template<> struct V_TypeTraits<type> \
 { \
     typedef type value_type; \
@@ -114,7 +114,6 @@ template<typename _Tp> struct V_TypeTraits
     typedef w_type_ w_type; \
     typedef q_type_ q_type; \
     typedef sum_type_ sum_type; \
-    enum { nlanes128 = nlanes128_ }; \
 \
     static inline int_type reinterpret_int(type x) \
     { \
@@ -131,7 +130,7 @@ template<typename _Tp> struct V_TypeTraits
     } \
 }
 
-#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_, nlanes128_) \
+#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \
 template<> struct V_TypeTraits<type> \
 { \
     typedef type value_type; \
@@ -140,7 +139,6 @@ template<typename _Tp> struct V_TypeTraits
     typedef uint_type_ uint_type; \
     typedef w_type_ w_type; \
     typedef sum_type_ sum_type; \
-    enum { nlanes128 = nlanes128_ }; \
 \
     static inline int_type reinterpret_int(type x) \
     { \
@@ -157,16 +155,16 @@ template<typename _Tp> struct V_TypeTraits
     } \
 }
 
-CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16);
-CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16);
-CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8);
-CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8);
-CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned, 4); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int, 4); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float, 4); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64, 2); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64, 2); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double, 2); +CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned); +CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int); +CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned); +CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double); #ifndef CV_DOXYGEN @@ -202,7 +200,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; # undef CV_RVV #endif -#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD) && !defined(CV_FORCE_SIMD128_CPP) +#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP) #define CV__SIMD_FORWARD 128 #include "opencv2/core/hal/intrin_forward.hpp" #endif @@ -314,54 +312,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN //================================================================================================== -#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ - inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \ - inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \ - inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \ - inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \ - inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \ - inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \ - inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \ - inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } \ - inline vtyp vx_lut(const typ* ptr, const int* idx) { return prefix##_lut(ptr, idx); } \ - inline vtyp vx_lut_pairs(const typ* ptr, const int* idx) { return prefix##_lut_pairs(ptr, idx); } - -#define CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \ - inline vtyp vx_lut_quads(const typ* ptr, const int* idx) { return prefix##_lut_quads(ptr, idx); } - -#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ - inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); } - -#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \ - inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); } - -#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ - 
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) - -#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(ushort, v_uint16, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(short, v_int16, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(int, v_int32, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(unsigned, v_uint32, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(float, v_float32, prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix) - template struct V_RegTraits { }; @@ -421,6 +371,7 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void); CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16); #endif +//! @endcond #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512) #define CV__SIMD_NAMESPACE simd512 @@ -429,21 +380,33 @@ namespace CV__SIMD_NAMESPACE { #define CV_SIMD_64F CV_SIMD512_64F #define CV_SIMD_FP16 CV_SIMD512_FP16 #define CV_SIMD_WIDTH 64 +//! @addtogroup core_hal_intrin +//! @{ + //! @brief Maximum available vector register capacity 8-bit unsigned integer values typedef v_uint8x64 v_uint8; + //! @brief Maximum available vector register capacity 8-bit signed integer values typedef v_int8x64 v_int8; + //! @brief Maximum available vector register capacity 16-bit unsigned integer values typedef v_uint16x32 v_uint16; + //! @brief Maximum available vector register capacity 16-bit signed integer values typedef v_int16x32 v_int16; + //! @brief Maximum available vector register capacity 32-bit unsigned integer values typedef v_uint32x16 v_uint32; + //! @brief Maximum available vector register capacity 32-bit signed integer values typedef v_int32x16 v_int32; + //! @brief Maximum available vector register capacity 64-bit unsigned integer values typedef v_uint64x8 v_uint64; + //! @brief Maximum available vector register capacity 64-bit signed integer values typedef v_int64x8 v_int64; + //! @brief Maximum available vector register capacity 32-bit floating point values (single precision) typedef v_float32x16 v_float32; - CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v512) -#if CV_SIMD512_64F + #if CV_SIMD512_64F + //! @brief Maximum available vector register capacity 64-bit floating point values (double precision) typedef v_float64x8 v_float64; - CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v512, load) -#endif - inline void vx_cleanup() { v512_cleanup(); } + #endif +//! 
@} + + #define VXPREFIX(func) v512##func } // namespace using namespace CV__SIMD_NAMESPACE; #elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256) @@ -453,21 +416,33 @@ namespace CV__SIMD_NAMESPACE { #define CV_SIMD_64F CV_SIMD256_64F #define CV_SIMD_FP16 CV_SIMD256_FP16 #define CV_SIMD_WIDTH 32 +//! @addtogroup core_hal_intrin +//! @{ + //! @brief Maximum available vector register capacity 8-bit unsigned integer values typedef v_uint8x32 v_uint8; + //! @brief Maximum available vector register capacity 8-bit signed integer values typedef v_int8x32 v_int8; + //! @brief Maximum available vector register capacity 16-bit unsigned integer values typedef v_uint16x16 v_uint16; + //! @brief Maximum available vector register capacity 16-bit signed integer values typedef v_int16x16 v_int16; + //! @brief Maximum available vector register capacity 32-bit unsigned integer values typedef v_uint32x8 v_uint32; + //! @brief Maximum available vector register capacity 32-bit signed integer values typedef v_int32x8 v_int32; + //! @brief Maximum available vector register capacity 64-bit unsigned integer values typedef v_uint64x4 v_uint64; + //! @brief Maximum available vector register capacity 64-bit signed integer values typedef v_int64x4 v_int64; + //! @brief Maximum available vector register capacity 32-bit floating point values (single precision) typedef v_float32x8 v_float32; - CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) #if CV_SIMD256_64F + //! @brief Maximum available vector register capacity 64-bit floating point values (double precision) typedef v_float64x4 v_float64; - CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) #endif - inline void vx_cleanup() { v256_cleanup(); } +//! @} + + #define VXPREFIX(func) v256##func } // namespace using namespace CV__SIMD_NAMESPACE; #elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128) @@ -480,25 +455,228 @@ namespace CV__SIMD_NAMESPACE { #define CV_SIMD CV_SIMD128 #define CV_SIMD_64F CV_SIMD128_64F #define CV_SIMD_WIDTH 16 +//! @addtogroup core_hal_intrin +//! @{ + //! @brief Maximum available vector register capacity 8-bit unsigned integer values typedef v_uint8x16 v_uint8; + //! @brief Maximum available vector register capacity 8-bit signed integer values typedef v_int8x16 v_int8; + //! @brief Maximum available vector register capacity 16-bit unsigned integer values typedef v_uint16x8 v_uint16; + //! @brief Maximum available vector register capacity 16-bit signed integer values typedef v_int16x8 v_int16; + //! @brief Maximum available vector register capacity 32-bit unsigned integer values typedef v_uint32x4 v_uint32; + //! @brief Maximum available vector register capacity 32-bit signed integer values typedef v_int32x4 v_int32; + //! @brief Maximum available vector register capacity 64-bit unsigned integer values typedef v_uint64x2 v_uint64; + //! @brief Maximum available vector register capacity 64-bit signed integer values typedef v_int64x2 v_int64; + //! @brief Maximum available vector register capacity 32-bit floating point values (single precision) typedef v_float32x4 v_float32; - CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v) #if CV_SIMD128_64F + //! @brief Maximum available vector register capacity 64-bit floating point values (double precision) typedef v_float64x2 v_float64; - CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load) #endif - inline void vx_cleanup() { v_cleanup(); } +//! 
@} + + #define VXPREFIX(func) v##func } // namespace using namespace CV__SIMD_NAMESPACE; #endif +namespace CV__SIMD_NAMESPACE { +//! @addtogroup core_hal_intrin +//! @{ + //! @name Wide init with value + //! @{ + //! @brief Create maximum available capacity vector with elements set to a specific value + inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); } + inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); } + inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); } + inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); } + inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); } + inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); } + inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); } + inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); } + inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); } +#if CV_SIMD_64F + inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); } +#endif + //! @} + + //! @name Wide init with zero + //! @{ + //! @brief Create maximum available capacity vector with elements set to zero + inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); } + inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); } + inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); } + inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); } + inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); } + inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); } + inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); } + inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); } + inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); } +#if CV_SIMD_64F + inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); } +#endif + //! @} + + //! @name Wide load from memory + //! @{ + //! @brief Load maximum available capacity register contents from memory + inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); } + inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); } + inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); } + inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); } + inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); } + inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); } + inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); } + inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); } + inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); } +#if CV_SIMD_64F + inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); } +#endif + //! @} + + //! @name Wide load from memory(aligned) + //! @{ + //! 
@brief Load maximum available capacity register contents from memory(aligned) + inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); } +#if CV_SIMD_64F + inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); } +#endif + //! @} + + //! @name Wide load lower half from memory + //! @{ + //! @brief Load lower half of maximum available capacity register from memory + inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); } +#if CV_SIMD_64F + inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); } +#endif + //! @} + + //! @name Wide load halfs from memory + //! @{ + //! @brief Load maximum available capacity register contents from two memory blocks + inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } +#if CV_SIMD_64F + inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } +#endif + //! @} + + //! @name Wide LUT of elements + //! @{ + //! 
@brief Load maximum available capacity register contents with array elements by provided indexes + inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } +#if CV_SIMD_64F + inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } +#endif + //! @} + + //! @name Wide LUT of element pairs + //! @{ + //! @brief Load maximum available capacity register contents with array element pairs by provided indexes + inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } +#if CV_SIMD_64F + inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } +#endif + //! @} + + //! @name Wide LUT of element quads + //! @{ + //! @brief Load maximum available capacity register contents with array element quads by provided indexes + inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + //! @} + + //! @name Wide load with double expansion + //! @{ + //! 
@brief Load maximum available capacity register contents from memory with double expand
+    inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    //! @}
+
+    //! @name Wide load with quad expansion
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory with quad expand
+    inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
+    inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
+    //! @}
+
+    /** @brief SIMD processing state cleanup call */
+    inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
+
+
+//! @cond IGNORED
+
+    // backward compatibility
+    template<typename _Tp, typename _Tvec> static inline
+    void vx_store(_Tp* dst, const _Tvec& v) { return v_store(dst, v); }
+    // backward compatibility
+    template<typename _Tp, typename _Tvec> static inline
+    void vx_store_aligned(_Tp* dst, const _Tvec& v) { return v_store_aligned(dst, v); }
+
+//! @endcond
+
+
+//! @}
+    #undef VXPREFIX
+} // namespace
+
+//! @cond IGNORED
 #ifndef CV_SIMD_64F
 #define CV_SIMD_64F 0
 #endif
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 859bfd72dc..46222140e6 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -52,10 +52,21 @@
 //! @cond IGNORED
 
 #define CV_SIMD128_CPP 1
-#if defined(CV_FORCE_SIMD128_CPP) || defined(CV_DOXYGEN)
+#if defined(CV_FORCE_SIMD128_CPP)
 #define CV_SIMD128 1
 #define CV_SIMD128_64F 1
 #endif
+#if defined(CV_DOXYGEN)
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#define CV_SIMD256 1
+#define CV_SIMD256_64F 1
+#define CV_SIMD512 1
+#define CV_SIMD512_64F 1
+#else
+#define CV_SIMD256 0 // Explicitly disable SIMD256 and SIMD512 support for scalar intrinsic implementation
+#define CV_SIMD512 0 // to avoid warnings during compilation
+#endif
 //! @endcond
 
 namespace cv
@@ -68,17 +79,33 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 
 /** @addtogroup core_hal_intrin
 
 "Universal intrinsics" is a types and functions set intended to simplify vectorization of code on
-different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
-architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
-containing packed values of different types. In case when there is no SIMD extension available
-during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as
-expected although it could be slower.
+different platforms. Currently a few different SIMD extensions on different architectures are supported.
+Support for 128 bit registers of various types is implemented for a wide range of architectures,
+including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__), MIPS(__MSA__).
+256 bit long registers are supported on x86(__AVX2__) and 512 bit long registers are supported on x86(__AVX512__).
+In case no SIMD extension is available during compilation, the fallback C++ implementation of the intrinsics
+will be chosen and the code will work as expected, although it could be slower.
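The width-agnostic vx_ wrappers introduced above make the common load/compute/store pattern portable across the 128/256/512-bit backends; a sketch (an editor's illustration, not part of the patch; `v_float32::nlanes` gives the lane count of the widest available register):

```cpp
// Sketch: scale a float buffer with whatever register width the build provides.
#include <opencv2/core/hal/intrin.hpp>

void scale(const float* src, float* dst, int n, float factor)
{
#if CV_SIMD
    const cv::v_float32 vf = cv::vx_setall_f32(factor);
    int i = 0;
    for (; i + cv::v_float32::nlanes <= n; i += cv::v_float32::nlanes)
        cv::v_store(dst + i, cv::vx_load(src + i) * vf);
    cv::vx_cleanup();   // SIMD state cleanup call
    for (; i < n; ++i)  // scalar tail
        dst[i] = src[i] * factor;
#else
    for (int i = 0; i < n; ++i)
        dst[i] = src[i] * factor;
#endif
}
```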
 
 ### Types
 
-There are several types representing 128-bit register as a vector of packed values, each type is
+There are several types of vector registers holding packed values, each type is
 implemented as a structure based on a one SIMD register.
 
+- cv::v_uint8 and cv::v_int8: 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16 and cv::v_int16: 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32 and cv::v_int32: 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64 and cv::v_int64: 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32: 32-bit floating point values (signed) - float
+- cv::v_float64: 64-bit floating point values (signed) - double
+
+The exact bit length (and lane count) of the listed types is deduced at compile time and depends on the
+architecture SIMD capabilities chosen as available during compilation of the library. All the types contain
+an __nlanes__ enumeration to query the exact lane count of the type.
+
+When the exact bit length of the type is important, it is possible to use specific fixed length register types.
+
+There are several types representing 128-bit registers.
+
 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
@@ -86,28 +113,96 @@ implemented as a structure based on a one SIMD register.
 - cv::v_float32x4: four 32-bit floating point values (signed) - float
 - cv::v_float64x2: two 64-bit floating point values (signed) - double
 
+There are several types representing 256-bit registers.
+
+- cv::v_uint8x32 and cv::v_int8x32: thirty two 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16x16 and cv::v_int16x16: sixteen 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32x8 and cv::v_int32x8: eight 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64x4 and cv::v_int64x4: four 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32x8: eight 32-bit floating point values (signed) - float
+- cv::v_float64x4: four 64-bit floating point values (signed) - double
+
 @note
-cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
-check the CV_SIMD128_64F preprocessor definition:
+256 bit registers are at the moment implemented for the AVX2 SIMD extension only; if you want to use this type directly,
+don't forget to check the CV_SIMD256 preprocessor definition:
 @code
-#if CV_SIMD128_64F
+#if CV_SIMD256
 //...
 #endif
 @endcode
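A sketch of that guard in use (an editor's illustration, not part of the patch): the fixed 256-bit type is touched only when the AVX2 backend actually provides it.

```cpp
// Sketch: sum eight floats with a fixed 256-bit register when available.
#include <opencv2/core/hal/intrin.hpp>

float sum8(const float* ptr) // ptr must point to at least 8 floats
{
#if CV_SIMD256
    cv::v_float32x8 r = cv::v256_load(ptr);
    return cv::v_reduce_sum(r);
#else
    float s = 0.f;
    for (int i = 0; i < 8; ++i)
        s += ptr[i];
    return s;
#endif
}
```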
+
+There are several types representing 512-bit registers.
+
+- cv::v_uint8x64 and cv::v_int8x64: sixty four 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16x32 and cv::v_int16x32: thirty two 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32x16 and cv::v_int32x16: sixteen 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64x8 and cv::v_int64x8: eight 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32x16: sixteen 32-bit floating point values (signed) - float
+- cv::v_float64x8: eight 64-bit floating point values (signed) - double
+@note
+512 bit registers are at the moment implemented for the AVX512 SIMD extension only; if you want to use this type directly,
+don't forget to check the CV_SIMD512 preprocessor definition.
+
+@note
+cv::v_float64x2 is not implemented in the NEON variant; if you want to use this type, don't forget to
+check the CV_SIMD128_64F preprocessor definition.
+
 ### Load and store operations
 
 These operations allow to set contents of the register explicitly or by loading it from some memory
 block and to save contents of the register to memory block.
 
+There are variable size register load operations that provide a result of the maximum available size,
+depending on the platform capabilities chosen at compile time:
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+- Other create methods:
+vx_setall_s8, vx_setall_u8, ...,
+vx_setzero_u8, vx_setzero_s8, ...
+- Memory load operations:
+vx_load, vx_load_aligned, vx_load_low, vx_load_halves,
+- Memory operations with expansion of values:
+vx_load_expand, vx_load_expand_q
+
+Also there are fixed size register load/store operations.
+
+For 128 bit registers
 - Constructors:
 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
 - Other create methods:
 @ref v_setall_s8, @ref v_setall_u8, ...,
 @ref v_setzero_u8, @ref v_setzero_s8, ...
-- Memory operations:
+- Memory load operations:
 @ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
+- Memory operations with expansion of values:
+@ref v_load_expand, @ref v_load_expand_q
+
+For 256 bit registers (check the CV_SIMD256 preprocessor definition)
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+@ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) "from four values", ...
+- Other create methods:
+@ref v256_setall_s8, @ref v256_setall_u8, ...,
+@ref v256_setzero_u8, @ref v256_setzero_s8, ...
+- Memory load operations:
+@ref v256_load, @ref v256_load_aligned, @ref v256_load_low, @ref v256_load_halves,
+- Memory operations with expansion of values:
+@ref v256_load_expand, @ref v256_load_expand_q
+
+For 512 bit registers (check the CV_SIMD512 preprocessor definition)
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+@ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7) "from eight values", ...
+- Other create methods:
+@ref v512_setall_s8, @ref v512_setall_u8, ...,
+@ref v512_setzero_u8, @ref v512_setzero_s8, ...
+- Memory load operations:
+@ref v512_load, @ref v512_load_aligned, @ref v512_load_low, @ref v512_load_halves,
+- Memory operations with expansion of values:
+@ref v512_load_expand, @ref v512_load_expand_q
+
+Store to memory operations are similar across different platform capabilities:
 @ref v_store, @ref v_store_aligned,
 @ref v_store_high, @ref v_store_low
 
@@ -116,7 +211,7 @@ block and to save contents of the register to memory block.
 
 These operations allow to reorder or recombine elements in one or multiple vectors.
 
 - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
-- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
+- Expand: @ref v_expand, @ref v_expand_low, @ref v_expand_high
 - Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
@@ -153,7 +248,7 @@ Element-wise binary and unary operations.
 @ref operator >=(const v_reg &a, const v_reg &b) ">=",
 @ref operator <(const v_reg &a, const v_reg &b) "<",
 @ref operator <=(const v_reg &a, const v_reg &b) "<=",
-@ref operator==(const v_reg &a, const v_reg &b) "==",
+@ref operator ==(const v_reg &a, const v_reg &b) "==",
 @ref operator !=(const v_reg &a, const v_reg &b) "!="
 
 - min/max: @ref v_min, @ref v_max
@@ -190,7 +285,7 @@ shows the applicability of different operations to the types.
 
 Regular integers:
 
-| Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
+| Operations\\Types | uint 8 | int 8 | uint 16 | int 16 | uint 32 | int 32 |
 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
 |load, store        | x | x | x | x | x | x |
 |interleave         | x | x | x | x | x | x |
@@ -230,7 +325,7 @@ Regular integers:
 
 Big integers:
 
-| Operations\\Types | uint 64x2 | int 64x2 |
+| Operations\\Types | uint 64 | int 64 |
 |-------------------|:-:|:-:|
 |load, store        | x | x |
 |add, sub           | x | x |
@@ -244,7 +339,7 @@ Big integers:
 
 Floating point:
 
-| Operations\\Types | float 32x4 | float 64x2 |
+| Operations\\Types | float 32 | float 64 |
 |-------------------|:-:|:-:|
 |load, store        | x | x |
 |interleave         | x |   |
@@ -410,6 +505,67 @@ typedef v_reg<uint64, 2> v_uint64x2;
 /** @brief Two 64-bit signed integer values */
 typedef v_reg<int64, 2> v_int64x2;
 
+#if CV_SIMD256
+/** @brief Thirty two 8-bit unsigned integer values */
+typedef v_reg<uchar, 32> v_uint8x32;
+/** @brief Thirty two 8-bit signed integer values */
+typedef v_reg<schar, 32> v_int8x32;
+/** @brief Sixteen 16-bit unsigned integer values */
+typedef v_reg<ushort, 16> v_uint16x16;
+/** @brief Sixteen 16-bit signed integer values */
+typedef v_reg<short, 16> v_int16x16;
+/** @brief Eight 32-bit unsigned integer values */
+typedef v_reg<unsigned, 8> v_uint32x8;
+/** @brief Eight 32-bit signed integer values */
+typedef v_reg<int, 8> v_int32x8;
+/** @brief Eight 32-bit floating point values (single precision) */
+typedef v_reg<float, 8> v_float32x8;
+/** @brief Four 64-bit floating point values (double precision) */
+typedef v_reg<double, 4> v_float64x4;
+/** @brief Four 64-bit unsigned integer values */
+typedef v_reg<uint64, 4> v_uint64x4;
+/** @brief Four 64-bit signed integer values */
+typedef v_reg<int64, 4> v_int64x4;
+#endif
+
+#if CV_SIMD512
+/** @brief Sixty four 8-bit unsigned integer values */
+typedef v_reg<uchar, 64> v_uint8x64;
+/** @brief Sixty four 8-bit signed integer values */
+typedef v_reg<schar, 64> v_int8x64;
+/** @brief Thirty two 16-bit unsigned integer values */
+typedef v_reg<ushort, 32> v_uint16x32;
+/** @brief Thirty two 16-bit signed integer values */
+typedef v_reg<short, 32> v_int16x32;
+/** @brief Sixteen 32-bit unsigned integer values */
+typedef v_reg<unsigned, 16> v_uint32x16;
+/** @brief Sixteen 32-bit signed integer values */
+typedef v_reg<int, 16> v_int32x16;
+/** @brief Sixteen 32-bit floating point values (single precision) */
+typedef v_reg<float, 16> v_float32x16;
+/** @brief Eight 64-bit floating point values (double precision) */
+typedef v_reg<double, 8> v_float64x8;
+/** @brief Eight 64-bit unsigned integer values */
+typedef v_reg<uint64, 8> v_uint64x8;
+/** @brief Eight 64-bit signed integer values */
+typedef v_reg<int64, 8> v_int64x8;
+#endif
+
+enum {
+    simd128_width = 16,
+#if CV_SIMD256
+    simd256_width = 32,
+#endif
+#if CV_SIMD512
+    simd512_width = 64,
+    simdmax_width = simd512_width
+#elif CV_SIMD256
+    simdmax_width = simd256_width
+#else
+    simdmax_width = simd128_width
+#endif
+};
+
 /** @brief Add values
 
 For all types. */
@@ -559,27 +715,6 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
     return c; \
 }
 
-//! @brief Helper macro
-//! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(func, cfunc) \
-inline v_reg<int, 4> func(const v_reg<float, 4>& a) \
-{ \
-    v_reg<int, 4> c; \
-    for( int i = 0; i < 4; i++ ) \
-        c.s[i] = cfunc(a.s[i]); \
-    return c; \
-} \
-inline v_reg<int, 4> func(const v_reg<double, 2>& a) \
-{ \
-    v_reg<int, 4> c; \
-    for( int i = 0; i < 2; i++ ) \
-    { \
-        c.s[i] = cfunc(a.s[i]); \
-        c.s[i + 2] = 0; \
-    } \
-    return c; \
-}
-
 /** @brief Square root of elements
 
 Only for floating point types.*/
@@ -598,26 +733,6 @@ Only for floating point types.*/
 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, typename V_TypeTraits<_Tp>::abs_type)
 
-/** @brief Round elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_round, cvRound)
-
-/** @brief Floor elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_floor, cvFloor)
-
-/** @brief Ceil elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_ceil, cvCeil)
-
-/** @brief Truncate elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_trunc, int)
-
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
@@ -855,9 +970,9 @@ inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp,
 
 /** @overload
 For 32-bit floating point values */
-inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
 {
-    v_float32x4 c;
+    v_reg<float, n> c;
     for( int i = 0; i < c.nlanes; i++ )
         c.s[i] = _absdiff(a.s[i], b.s[i]);
     return c;
@@ -866,9 +981,9 @@ inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
 
 /** @overload
 For 64-bit floating point values */
-inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
 {
-    v_float64x2 c;
+    v_reg<double, n> c;
     for( int i = 0; i < c.nlanes; i++ )
         c.s[i] = _absdiff(a.s[i], b.s[i]);
     return c;
@@ -1238,14 +1353,17 @@ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_redu
     result[3] = d[0] + d[1] + d[2] + d[3]
 @endcode
 */
-inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
-                                 const v_float32x4& c, const v_float32x4& d)
+template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
+                                                     const v_reg<float, n>& c, const v_reg<float, n>& d)
 {
-    v_float32x4 r;
-    r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
-    r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
-    r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
-    r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
+    v_reg<float, n> r;
+    for(int i = 0; i < (n/4); i++)
+    {
+        r.s[i*4 + 0] = a.s[i*4 + 0] + a.s[i*4 + 1] + a.s[i*4 + 2] + a.s[i*4 + 3];
+        r.s[i*4 + 1] = b.s[i*4 + 0] + b.s[i*4 + 1] + b.s[i*4 + 2] + b.s[i*4 + 3];
+        r.s[i*4 + 2] = c.s[i*4 + 0] + c.s[i*4 + 1] + c.s[i*4 + 2] + c.s[i*4 + 3];
+        r.s[i*4 + 3] = d.s[i*4 + 0] + d.s[i*4 + 1] + d.s[i*4 + 2] + d.s[i*4 + 3];
+    }
     return r;
 }
 
@@ -1459,30 +1577,116 @@ template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const
 
@note Returned type will be detected from passed pointer type, for example uchar ==> 
cv::v_uint8x16, int ==> cv::v_int32x4, etc. +@note Use vx_load version to get maximum available register length result + @note Alignment requirement: if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough). Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`). */ template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr) +inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load(const _Tp* ptr) { #if CV_STRONG_ALIGNMENT CV_Assert(isAligned(ptr)); #endif - return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); + return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr); } +#if CV_SIMD256 +/** @brief Load 256-bit length register contents from memory + +@param ptr pointer to memory block with data +@return register object + +@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x32, int ==> cv::v_int32x8, etc. + +@note Check CV_SIMD256 preprocessor definition prior to use. +Use vx_load version to get maximum available register length result + +@note Alignment requirement: +if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough). +Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`). + */ +template +inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr); +} +#endif + +#if CV_SIMD512 +/** @brief Load 512-bit length register contents from memory + +@param ptr pointer to memory block with data +@return register object + +@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x64, int ==> cv::v_int32x16, etc. + +@note Check CV_SIMD512 preprocessor definition prior to use. +Use vx_load version to get maximum available register length result + +@note Alignment requirement: +if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough). +Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`). + */ +template +inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr); +} +#endif + /** @brief Load register contents from memory (aligned) similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary in case of SIMD128, 32-byte - SIMD256, etc) - */ + +@note Use vx_load_aligned version to get maximum available register length result +*/ template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) +inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_aligned(const _Tp* ptr) { - CV_Assert(isAligned::nlanes128>)>(ptr)); - return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); + CV_Assert(isAligned)>(ptr)); + return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr); } +#if CV_SIMD256 +/** @brief Load register contents from memory (aligned) + +similar to cv::v256_load, but source memory block should be aligned (to 32-byte boundary in case of SIMD256, 64-byte - SIMD512, etc) + +@note Check CV_SIMD256 preprocessor definition prior to use. 
+Use vx_load_aligned version to get maximum available register length result +*/ +template +inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_aligned(const _Tp* ptr) +{ + CV_Assert(isAligned)>(ptr)); + return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr); +} +#endif + +#if CV_SIMD512 +/** @brief Load register contents from memory (aligned) + +similar to cv::v512_load, but source memory block should be aligned (to 64-byte boundary in case of SIMD512, etc) + +@note Check CV_SIMD512 preprocessor definition prior to use. +Use vx_load_aligned version to get maximum available register length result +*/ +template +inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_aligned(const _Tp* ptr) +{ + CV_Assert(isAligned)>(ptr)); + return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr); +} +#endif + /** @brief Load 64-bits of data to lower part (high part is undefined). @param ptr memory block containing data for first half (0..n/2) @@ -1491,14 +1695,16 @@ inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) int lo[2] = { 1, 2 }; v_int32x4 r = v_load_low(lo); @endcode - */ + +@note Use vx_load_low version to get maximum available register length result +*/ template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) +inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_low(const _Tp* ptr) { #if CV_STRONG_ALIGNMENT CV_Assert(isAligned(ptr)); #endif - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + v_reg<_Tp, simd128_width / sizeof(_Tp)> c; for( int i = 0; i < c.nlanes/2; i++ ) { c.s[i] = ptr[i]; @@ -1506,6 +1712,62 @@ inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) return c; } +#if CV_SIMD256 +/** @brief Load 128-bits of data to lower part (high part is undefined). + +@param ptr memory block containing data for first half (0..n/2) + +@code{.cpp} +int lo[4] = { 1, 2, 3, 4 }; +v_int32x8 r = v256_load_low(lo); +@endcode + +@note Check CV_SIMD256 preprocessor definition prior to use. +Use vx_load_low version to get maximum available register length result +*/ +template +inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_low(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + v_reg<_Tp, simd256_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes / 2; i++) + { + c.s[i] = ptr[i]; + } + return c; +} +#endif + +#if CV_SIMD512 +/** @brief Load 256-bits of data to lower part (high part is undefined). + +@param ptr memory block containing data for first half (0..n/2) + +@code{.cpp} +int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }; +v_int32x16 r = v512_load_low(lo); +@endcode + +@note Check CV_SIMD512 preprocessor definition prior to use. 
+Use vx_load_low version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_low(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    v_reg<_Tp, simd512_width / sizeof(_Tp)> c;
+    for (int i = 0; i < c.nlanes / 2; i++)
+    {
+        c.s[i] = ptr[i];
+    }
+    return c;
+}
+#endif
+
 /** @brief Load register contents from two memory blocks

 @param loptr memory block containing data for first half (0..n/2)
@@ -1515,15 +1777,17 @@ inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
 v_int32x4 r = v_load_halves(lo, hi);
 @endcode
- */
+
+@note Use vx_load_halves version to get maximum available register length result
+*/
 template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
+inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
 {
 #if CV_STRONG_ALIGNMENT
     CV_Assert(isAligned<sizeof(_Tp)>(loptr));
     CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
 #endif
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
     for( int i = 0; i < c.nlanes/2; i++ )
     {
         c.s[i] = loptr[i];
@@ -1532,6 +1796,68 @@ inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr,
     return c;
 }

+#if CV_SIMD256
+/** @brief Load register contents from two memory blocks
+
+@param loptr memory block containing data for first half (0..n/2)
+@param hiptr memory block containing data for second half (n/2..n)
+
+@code{.cpp}
+int lo[4] = { 1, 2, 3, 4 }, hi[4] = { 5, 6, 7, 8 };
+v_int32x8 r = v256_load_halves(lo, hi);
+@endcode
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load_halves version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
+    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
+#endif
+    v_reg<_Tp, simd256_width / sizeof(_Tp)> c;
+    for (int i = 0; i < c.nlanes / 2; i++)
+    {
+        c.s[i] = loptr[i];
+        c.s[i + c.nlanes / 2] = hiptr[i];
+    }
+    return c;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load register contents from two memory blocks
+
+@param loptr memory block containing data for first half (0..n/2)
+@param hiptr memory block containing data for second half (n/2..n)
+
+@code{.cpp}
+int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, hi[8] = { 9, 10, 11, 12, 13, 14, 15, 16 };
+v_int32x16 r = v512_load_halves(lo, hi);
+@endcode
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load_halves version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
+    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
+#endif
+    v_reg<_Tp, simd512_width / sizeof(_Tp)> c;
+    for (int i = 0; i < c.nlanes / 2; i++)
+    {
+        c.s[i] = loptr[i];
+        c.s[i + c.nlanes / 2] = hiptr[i];
+    }
+    return c;
+}
+#endif
+
 /** @brief Load register contents from memory with double expand

 Same as cv::v_load, but result pack type will be 2x wider than memory type.

 @code{.cpp}
 short buf[4] = {1, 2, 3, 4}; // type is int16
 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
 @endcode
-For 8-, 16-, 32-bit integer source types. */
+For 8-, 16-, 32-bit integer source types.
+
+@note Use vx_load_expand version to get maximum available register length result
+*/
 template<typename _Tp>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
 v_load_expand(const _Tp* ptr)
 {
 #if CV_STRONG_ALIGNMENT
     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
 #endif
     typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
+    v_reg<w_type, simd128_width / sizeof(w_type)> c;
     for( int i = 0; i < c.nlanes; i++ )
     {
         c.s[i] = ptr[i];
@@ -1557,23 +1886,88 @@ v_load_expand(const _Tp* ptr)
     return c;
 }
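An editor's usage sketch for the widening load just above (not part of the patch; the buffer
contents are assumptions):
@code{.cpp}
uchar src[8] = {1, 2, 3, 4, 5, 250, 251, 252};
v_uint16x8 w = v_load_expand(src); // every uchar widens to a ushort lane, so 250..252 are kept exactly
@endcode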
+#if CV_SIMD256
+/** @brief Load register contents from memory with double expand
+
+Same as cv::v256_load, but result pack type will be 2x wider than memory type.
+
+@code{.cpp}
+short buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int16
+v_int32x8 r = v256_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
+@endcode
+For 8-, 16-, 32-bit integer source types.
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load_expand version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
+v256_load_expand(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    v_reg<w_type, simd256_width / sizeof(w_type)> c;
+    for (int i = 0; i < c.nlanes; i++)
+    {
+        c.s[i] = ptr[i];
+    }
+    return c;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load register contents from memory with double expand
+
+Same as cv::v512_load, but result pack type will be 2x wider than memory type.
+
+@code{.cpp}
+short buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int16
+v_int32x16 r = v512_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
+@endcode
+For 8-, 16-, 32-bit integer source types.
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load_expand version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
+v512_load_expand(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    v_reg<w_type, simd512_width / sizeof(w_type)> c;
+    for (int i = 0; i < c.nlanes; i++)
+    {
+        c.s[i] = ptr[i];
+    }
+    return c;
+}
+#endif
+
 /** @brief Load register contents from memory with quad expand

 Same as cv::v_load_expand, but result type is 4 times wider than source.
 @code{.cpp}
 char buf[4] = {1, 2, 3, 4}; // type is int8
-v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32
+v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
 @endcode
-For 8-bit integer source types. */
+For 8-bit integer source types.
+
+@note Use vx_load_expand_q version to get maximum available register length result
+*/
 template<typename _Tp>
-inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
+inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
 v_load_expand_q(const _Tp* ptr)
 {
 #if CV_STRONG_ALIGNMENT
     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
 #endif
     typedef typename V_TypeTraits<_Tp>::q_type q_type;
-    v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
+    v_reg<q_type, simd128_width / sizeof(q_type)> c;
     for( int i = 0; i < c.nlanes; i++ )
     {
         c.s[i] = ptr[i];
@@ -1581,6 +1975,66 @@ v_load_expand_q(const _Tp* ptr)
     return c;
 }

+#if CV_SIMD256
+/** @brief Load register contents from memory with quad expand
+
+Same as cv::v256_load_expand, but result type is 4 times wider than source.
+@code{.cpp} +char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int8 +v_int32x8 r = v256_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32 +@endcode +For 8-bit integer source types. + +@note Check CV_SIMD256 preprocessor definition prior to use. +Use vx_load_expand_q version to get maximum available register length result +*/ +template +inline v_reg::q_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::q_type)> +v256_load_expand_q(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg c; + for (int i = 0; i < c.nlanes; i++) + { + c.s[i] = ptr[i]; + } + return c; +} +#endif + +#if CV_SIMD512 +/** @brief Load register contents from memory with quad expand + +Same as cv::v512_load_expand, but result type is 4 times wider than source. +@code{.cpp} +char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int8 +v_int32x16 r = v512_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32 +@endcode +For 8-bit integer source types. + +@note Check CV_SIMD512 preprocessor definition prior to use. +Use vx_load_expand_q version to get maximum available register length result +*/ +template +inline v_reg::q_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::q_type)> +v512_load_expand_q(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg c; + for (int i = 0; i < c.nlanes; i++) + { + c.s[i] = ptr[i]; + } + return c; +} +#endif + /** @brief Load and deinterleave (2 channels) Load data from memory deinterleave and store to 2 registers. @@ -1965,9 +2419,11 @@ inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a) return v_reg<_Tp, n>::all(a.s[i]); } -/** @brief Round +/** @brief Round elements -Rounds each value. Input type is float vector ==> output type is int vector.*/ +Rounds each value. Input type is float vector ==> output type is int vector. +@note Only for floating point types. +*/ template inline v_reg v_round(const v_reg& a) { v_reg c; @@ -1988,9 +2444,11 @@ template inline v_reg v_round(const v_reg& a, const return c; } -/** @brief Floor +/** @brief Floor elements -Floor each value. Input type is float vector ==> output type is int vector.*/ +Floor each value. Input type is float vector ==> output type is int vector. +@note Only for floating point types. +*/ template inline v_reg v_floor(const v_reg& a) { v_reg c; @@ -1999,9 +2457,11 @@ template inline v_reg v_floor(const v_reg& a) return c; } -/** @brief Ceil +/** @brief Ceil elements -Ceil each value. Input type is float vector ==> output type is int vector.*/ +Ceil each value. Input type is float vector ==> output type is int vector. +@note Only for floating point types. +*/ template inline v_reg v_ceil(const v_reg& a) { v_reg c; @@ -2010,9 +2470,11 @@ template inline v_reg v_ceil(const v_reg& a) return c; } -/** @brief Trunc +/** @brief Truncate elements -Truncate each value. Input type is float vector ==> output type is int vector.*/ +Truncate each value. Input type is float vector ==> output type is int vector. +@note Only for floating point types. 
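+
+For instance (an editor's illustrative sketch):
+@code{.cpp}
+v_float32x4 a(1.9f, -1.9f, 2.5f, -2.5f);
+v_int32x4 t = v_trunc(a); // {1, -1, 2, -2}: fractional parts are discarded, rounding toward zero
+@endcode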
+*/ template inline v_reg v_trunc(const v_reg& a) { v_reg c; @@ -2036,7 +2498,7 @@ template inline v_reg v_round(const v_reg& a) /** @overload */ template inline v_reg v_floor(const v_reg& a) { - v_reg c; + v_reg c; for( int i = 0; i < n; i++ ) { c.s[i] = cvFloor(a.s[i]); @@ -2048,7 +2510,7 @@ template inline v_reg v_floor(const v_reg& a) /** @overload */ template inline v_reg v_ceil(const v_reg& a) { - v_reg c; + v_reg c; for( int i = 0; i < n; i++ ) { c.s[i] = cvCeil(a.s[i]); @@ -2060,10 +2522,10 @@ template inline v_reg v_ceil(const v_reg& a) /** @overload */ template inline v_reg v_trunc(const v_reg& a) { - v_reg c; + v_reg c; for( int i = 0; i < n; i++ ) { - c.s[i] = cvCeil(a.s[i]); + c.s[i] = (int)(a.s[i]); c.s[i+n] = 0; } return c; @@ -2071,7 +2533,7 @@ template inline v_reg v_trunc(const v_reg& a) /** @brief Convert to float -Supported input type is cv::v_int32x4. */ +Supported input type is cv::v_int32. */ template inline v_reg v_cvt_f32(const v_reg& a) { v_reg c; @@ -2080,6 +2542,9 @@ template inline v_reg v_cvt_f32(const v_reg& a) return c; } +/** @brief Convert lower half to float + +Supported input type is cv::v_float64. */ template inline v_reg v_cvt_f32(const v_reg& a) { v_reg c; @@ -2091,6 +2556,9 @@ template inline v_reg v_cvt_f32(const v_reg& a) return c; } +/** @brief Convert to float + +Supported input type is cv::v_float64. */ template inline v_reg v_cvt_f32(const v_reg& a, const v_reg& b) { v_reg c; @@ -2102,72 +2570,55 @@ template inline v_reg v_cvt_f32(const v_reg& a, co return c; } -/** @brief Convert to double +/** @brief Convert lower half to double -Supported input type is cv::v_int32x4. */ -CV_INLINE v_reg v_cvt_f64(const v_reg& a) +Supported input type is cv::v_int32. */ +template CV_INLINE v_reg v_cvt_f64(const v_reg& a) { - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) + v_reg c; + for( int i = 0; i < (n/2); i++ ) c.s[i] = (double)a.s[i]; return c; } /** @brief Convert to double high part of vector -Supported input type is cv::v_int32x4. */ -CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) +Supported input type is cv::v_int32. */ +template CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) { - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i + 2]; + v_reg c; + for( int i = 0; i < (n/2); i++ ) + c.s[i] = (double)a.s[i + (n/2)]; return c; } -/** @brief Convert to double +/** @brief Convert lower half to double -Supported input type is cv::v_float32x4. */ -CV_INLINE v_reg v_cvt_f64(const v_reg& a) +Supported input type is cv::v_float32. */ +template CV_INLINE v_reg v_cvt_f64(const v_reg& a) { - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) + v_reg c; + for( int i = 0; i < (n/2); i++ ) c.s[i] = (double)a.s[i]; return c; } /** @brief Convert to double high part of vector -Supported input type is cv::v_float32x4. */ -CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) +Supported input type is cv::v_float32. */ +template CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) { - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i + 2]; + v_reg c; + for( int i = 0; i < (n/2); i++ ) + c.s[i] = (double)a.s[i + (n/2)]; return c; } /** @brief Convert to double -Supported input type is cv::v_int64x2. */ -CV_INLINE v_reg v_cvt_f64(const v_reg& a) +Supported input type is cv::v_int64. 
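+
+For instance (an editor's illustrative sketch):
+@code{.cpp}
+v_int64x2 a(1, -2);
+v_float64x2 d = v_cvt_f64(a); // {1.0, -2.0}; all lanes fit, which is why the int64 _high variant is dropped
+@endcode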
*/ +template CV_INLINE v_reg v_cvt_f64(const v_reg& a) { - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -/** @brief Convert to double high part of vector - -Supported input type is cv::v_int64x2. */ -CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) -{ - enum { n = 2 }; v_reg c; for( int i = 0; i < n; i++ ) c.s[i] = (double)a.s[i]; @@ -2175,24 +2626,24 @@ CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) } -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx) +template inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut(const _Tp* tab, const int* idx) { - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + v_reg<_Tp, simd128_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes; i++) c.s[i] = tab[idx[i]]; return c; } -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx) +template inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_pairs(const _Tp* tab, const int* idx) { - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + v_reg<_Tp, simd128_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes; i++) c.s[i] = tab[idx[i / 2] + i % 2]; return c; } -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx) +template inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_quads(const _Tp* tab, const int* idx) { - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + v_reg<_Tp, simd128_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes; i++) c.s[i] = tab[idx[i / 4] + i % 4]; return c; } @@ -2221,36 +2672,15 @@ template inline v_reg v_lut(const float* tab, const v_reg inline v_reg v_lut(const double* tab, const v_reg& idx) +template inline v_reg v_lut(const double* tab, const v_reg& idx) { - v_reg c; - for( int i = 0; i < n; i++ ) + v_reg c; + for( int i = 0; i < n/2; i++ ) c.s[i] = tab[idx.s[i]]; return c; } -inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) -{ - return v_lut(tab, idxvec.s); -} - -inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) -{ - return v_lut(tab, idxvec.s); -} - -inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) -{ - return v_lut(tab, idxvec.s); -} - -inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) -{ - return v_lut(tab, idxvec.s); -} - - template inline void v_lut_deinterleave(const float* tab, const v_reg& idx, v_reg& x, v_reg& y) { @@ -2330,146 +2760,205 @@ b2 {A3 B3 C3 D3} b3 {A4 B4 C4 D4} @endcode */ -template -inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, - const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, - v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, - v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 ) +template +inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, + const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3, + v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1, + v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 ) { - b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); - b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); - b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); - b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); + for (int i = 0; i < n / 4; i++) + { + b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4]; + b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4]; + b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] 
= a1.s[1 + i*4]; + b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4]; + b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4]; + b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4]; + b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4]; + b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4]; + } } //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } +#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \ +inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } //! @name Init with zero //! @{ //! @brief Create new vector with zero elements -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, v, u8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, v, s8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, v, u16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, v, s16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, v, u32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, v, s32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, v, f32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, v, f64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, v, u64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, v, s64) + +#if CV_SIMD256 +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x32, v256, u8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x32, v256, s8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x16, v256, u16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x16, v256, s16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x8, v256, u32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x8, v256, s32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x8, v256, f32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x4, v256, f64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x4, v256, u64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x4, v256, s64) +#endif + +#if CV_SIMD512 +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x64, v512, u8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x64, v512, s8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x32, v512, u16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x32, v512, s16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x16, v512, u32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x16, v512, s32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x16, v512, f32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x8, v512, f64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x8, v512, u64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64) +#endif //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } +#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \ +inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } //! @name Init with value //! @{ //! 
@brief Create new vector with elements set to a specific value -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, v, u8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, v, s8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, v, u16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, v, s16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, v, u32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, v, s32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, v, f32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, v, f64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, v, u64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, v, s64) + +#if CV_SIMD256 +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x32, uchar, v256, u8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x32, schar, v256, s8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x16, ushort, v256, u16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x16, short, v256, s16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x8, unsigned, v256, u32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x8, int, v256, s32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x8, float, v256, f32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x4, double, v256, f64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x4, uint64, v256, u64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x4, int64, v256, s64) +#endif + +#if CV_SIMD512 +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x64, uchar, v512, u8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x64, schar, v512, s8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x32, ushort, v512, u16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x32, short, v512, s16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x16, unsigned, v512, u32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x16, int, v512, s32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x16, float, v512, f32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x8, double, v512, f64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x8, uint64, v512, u64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x8, int64, v512, s64) +#endif //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \ -template inline _Tpvec \ +#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) \ +template inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> \ v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \ -{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); } +{ return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); } //! @name Reinterpret //! @{ //! @brief Convert vector to different type without modifying underlying data. 
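//!
//! For example (an editor's illustrative sketch):
//! @code{.cpp}
//! v_float32x4 a = v_setall_f32(1.0f);
//! v_uint32x4 b = v_reinterpret_as_u32(a); // every lane now holds the bit pattern 0x3F800000
//! @endcode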
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_C_REINTERPRET(uchar, u8) +OPENCV_HAL_IMPL_C_REINTERPRET(schar, s8) +OPENCV_HAL_IMPL_C_REINTERPRET(ushort, u16) +OPENCV_HAL_IMPL_C_REINTERPRET(short, s16) +OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32) +OPENCV_HAL_IMPL_C_REINTERPRET(int, s32) +OPENCV_HAL_IMPL_C_REINTERPRET(float, f32) +OPENCV_HAL_IMPL_C_REINTERPRET(double, f64) +OPENCV_HAL_IMPL_C_REINTERPRET(uint64, u64) +OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64) //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \ -template inline _Tpvec v_shl(const _Tpvec& a) \ -{ return a << n; } +#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \ +template inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \ +{ return a << shift; } //! @name Left shift //! @{ //! @brief Shift left -OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short) -OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int) -OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64) +OPENCV_HAL_IMPL_C_SHIFTL(ushort) +OPENCV_HAL_IMPL_C_SHIFTL(short) +OPENCV_HAL_IMPL_C_SHIFTL(unsigned) +OPENCV_HAL_IMPL_C_SHIFTL(int) +OPENCV_HAL_IMPL_C_SHIFTL(uint64) +OPENCV_HAL_IMPL_C_SHIFTL(int64) //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \ -template inline _Tpvec v_shr(const _Tpvec& a) \ -{ return a >> n; } +#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \ +template inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \ +{ return a >> shift; } //! @name Right shift //! @{ //! @brief Shift right -OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short) -OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int) -OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64) +OPENCV_HAL_IMPL_C_SHIFTR(ushort) +OPENCV_HAL_IMPL_C_SHIFTR(short) +OPENCV_HAL_IMPL_C_SHIFTR(unsigned) +OPENCV_HAL_IMPL_C_SHIFTR(int) +OPENCV_HAL_IMPL_C_SHIFTR(uint64) +OPENCV_HAL_IMPL_C_SHIFTR(int64) //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \ -template inline _Tpvec v_rshr(const _Tpvec& a) \ +#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) \ +template inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) \ { \ - _Tpvec c; \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \ return c; \ } //! @name Rounding shift //! @{ //! 
@brief Rounding shift right -OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short) -OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int) -OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64) +OPENCV_HAL_IMPL_C_RSHIFTR(ushort) +OPENCV_HAL_IMPL_C_RSHIFTR(short) +OPENCV_HAL_IMPL_C_RSHIFTR(unsigned) +OPENCV_HAL_IMPL_C_RSHIFTR(int) +OPENCV_HAL_IMPL_C_RSHIFTR(uint64) +OPENCV_HAL_IMPL_C_RSHIFTR(int64) //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \ -inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ +#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) \ +template inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ - _Tpnvec c; \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + v_reg<_Tpn, 2*n> c; \ + for( int i = 0; i < n; i++ ) \ { \ c.s[i] = cast<_Tpn>(a.s[i]); \ - c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \ + c.s[i+n] = cast<_Tpn>(b.s[i]); \ } \ return c; \ } @@ -2485,26 +2974,26 @@ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ //! - pack_u: for 16- and 32-bit signed integer input types //! //! @note All variants except 64-bit use saturation. -OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast) //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ -template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ +#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) \ +template inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ - _Tpnvec c; \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + v_reg<_Tpn, 2*n> c; \ + for( int i = 0; i < n; i++ ) \ { \ - c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ - c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \ + c.s[i+n] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \ } \ return c; \ } @@ -2520,22 +3009,22 @@ template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpve //! - pack_u: for 16- and 32-bit signed integer input types //! //! @note All variants except 64-bit use saturation. 
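//!
//! For example (an editor's illustrative sketch of the rounding behaviour):
//! @code{.cpp}
//! v_int16x8 a = v_setall_s16(257), b = v_setall_s16(-257);
//! v_int8x16 p = v_rshr_pack<2>(a, b); // (257 + 2) >> 2 == 64 and (-257 + 2) >> 2 == -64
//! @endcode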
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(int, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast) //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ -inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ +#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \ +template inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \ { \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + for( int i = 0; i < n; i++ ) \ ptr[i] = cast<_Tpn>(a.s[i]); \ } @@ -2550,23 +3039,23 @@ inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ //! - pack_u: for 16- and 32-bit signed integer input types //! //! @note All variants except 64-bit use saturation. -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(int, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast) //! @} //! @brief Helper macro //! 
@ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ -template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ +#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \ +template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \ { \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + for( int i = 0; i < n; i++ ) \ + ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \ } //! @name Pack and store with rounding shift @@ -2580,14 +3069,14 @@ template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec //! - pack_u: for 16- and 32-bit signed integer input types //! //! @note All variants except 64-bit use saturation. -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(ushort, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, ushort, pack_u, saturate_cast) //! @} //! @cond IGNORED @@ -2622,9 +3111,9 @@ b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF} } @endcode */ -inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +template inline v_reg v_pack_b(const v_reg& a, const v_reg& b) { - v_uint8x16 mask; + v_reg mask; _pack_b(mask.s, a, b); return mask; } @@ -2645,12 +3134,12 @@ d {0 0xFFFF.. 
0 0xFFFF..} } @endcode */ -inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, - const v_uint32x4& c, const v_uint32x4& d) +template inline v_reg v_pack_b(const v_reg& a, const v_reg& b, + const v_reg& c, const v_reg& d) { - v_uint8x16 mask; + v_reg mask; _pack_b(mask.s, a, b); - _pack_b(mask.s + 8, c, d); + _pack_b(mask.s + 2*n, c, d); return mask; } @@ -2674,15 +3163,16 @@ h {0 0xFFFF..} 0xFF 0 0xFF 0 0 0xFF 0 0xFF } @endcode */ -inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, - const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, - const v_uint64x2& g, const v_uint64x2& h) +template inline v_reg v_pack_b(const v_reg& a, const v_reg& b, + const v_reg& c, const v_reg& d, + const v_reg& e, const v_reg& f, + const v_reg& g, const v_reg& h) { - v_uint8x16 mask; + v_reg mask; _pack_b(mask.s, a, b); - _pack_b(mask.s + 4, c, d); - _pack_b(mask.s + 8, e, f); - _pack_b(mask.s + 12, g, h); + _pack_b(mask.s + 2*n, c, d); + _pack_b(mask.s + 4*n, e, f); + _pack_b(mask.s + 6*n, g, h); return mask; } //! @} @@ -2697,71 +3187,109 @@ Scheme: {D0 D1 D2 D3} x |V3| ==================== {R0 R1 R2 R3}, where: -R0 = A0V0 + A1V1 + A2V2 + A3V3, -R1 = B0V0 + B1V1 + B2V2 + B3V3 +R0 = A0V0 + B0V1 + C0V2 + D0V3, +R1 = A1V0 + B1V1 + C1V2 + D1V3 ... @endcode */ -inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, - const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& m3) +template +inline v_reg v_matmul(const v_reg& v, + const v_reg& a, const v_reg& b, + const v_reg& c, const v_reg& d) { - return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], - v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], - v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], - v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); + v_reg res; + for (int i = 0; i < n / 4; i++) + { + res.s[0 + i*4] = v.s[0 + i*4] * a.s[0 + i*4] + v.s[1 + i*4] * b.s[0 + i*4] + v.s[2 + i*4] * c.s[0 + i*4] + v.s[3 + i*4] * d.s[0 + i*4]; + res.s[1 + i*4] = v.s[0 + i*4] * a.s[1 + i*4] + v.s[1 + i*4] * b.s[1 + i*4] + v.s[2 + i*4] * c.s[1 + i*4] + v.s[3 + i*4] * d.s[1 + i*4]; + res.s[2 + i*4] = v.s[0 + i*4] * a.s[2 + i*4] + v.s[1 + i*4] * b.s[2 + i*4] + v.s[2 + i*4] * c.s[2 + i*4] + v.s[3 + i*4] * d.s[2 + i*4]; + res.s[3 + i*4] = v.s[0 + i*4] * a.s[3 + i*4] + v.s[1 + i*4] * b.s[3 + i*4] + v.s[2 + i*4] * c.s[3 + i*4] + v.s[3 + i*4] * d.s[3 + i*4]; + } + return res; } /** @brief Matrix multiplication and add Scheme: @code -{A0 A1 A2 } |V0| |D0| -{B0 B1 B2 } |V1| |D1| -{C0 C1 C2 } x |V2| + |D2| -==================== +{A0 A1 A2 A3} |V0| |D0| +{B0 B1 B2 B3} |V1| |D1| +{C0 C1 C2 C3} x |V2| + |D2| +==================== |D3| {R0 R1 R2 R3}, where: -R0 = A0V0 + A1V1 + A2V2 + D0, -R1 = B0V0 + B1V1 + B2V2 + D1 +R0 = A0V0 + B0V1 + C0V2 + D0, +R1 = A1V0 + B1V1 + C1V2 + D1 ... 
@endcode */ -inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, - const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& m3) +template +inline v_reg v_matmuladd(const v_reg& v, + const v_reg& a, const v_reg& b, + const v_reg& c, const v_reg& d) { - return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0], - v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1], - v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2], - v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); + v_reg res; + for (int i = 0; i < n / 4; i++) + { + res.s[0 + i * 4] = v.s[0 + i * 4] * a.s[0 + i * 4] + v.s[1 + i * 4] * b.s[0 + i * 4] + v.s[2 + i * 4] * c.s[0 + i * 4] + d.s[0 + i * 4]; + res.s[1 + i * 4] = v.s[0 + i * 4] * a.s[1 + i * 4] + v.s[1 + i * 4] * b.s[1 + i * 4] + v.s[2 + i * 4] * c.s[1 + i * 4] + d.s[1 + i * 4]; + res.s[2 + i * 4] = v.s[0 + i * 4] * a.s[2 + i * 4] + v.s[1 + i * 4] * b.s[2 + i * 4] + v.s[2 + i * 4] * c.s[2 + i * 4] + d.s[2 + i * 4]; + res.s[3 + i * 4] = v.s[0 + i * 4] * a.s[3 + i * 4] + v.s[1 + i * 4] * b.s[3 + i * 4] + v.s[2 + i * 4] * c.s[3 + i * 4] + d.s[3 + i * 4]; + } + return res; } -inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +template inline v_reg v_dotprod_expand(const v_reg& a, const v_reg& b) { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } -inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +template inline v_reg v_dotprod_expand(const v_reg& a, const v_reg& b, + const v_reg& c) { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } -inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +template inline v_reg v_dotprod_expand_fast(const v_reg& a, const v_reg& b) { return v_dotprod_expand(a, b); } -inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +template inline v_reg v_dotprod_expand_fast(const v_reg& a, const v_reg& b, + const v_reg& c) { return v_dotprod_expand(a, b, c); } ////// FP16 support /////// -inline v_reg::nlanes128> +inline v_reg v_load_expand(const float16_t* ptr) { - v_reg::nlanes128> v; + v_reg v; for( int i = 0; i < v.nlanes; i++ ) { v.s[i] = ptr[i]; } return v; } +#if CV_SIMD256 +inline v_reg +v256_load_expand(const float16_t* ptr) +{ + v_reg v; + for (int i = 0; i < v.nlanes; i++) + { + v.s[i] = ptr[i]; + } + return v; +} +#endif +#if CV_SIMD512 +inline v_reg +v512_load_expand(const float16_t* ptr) +{ + v_reg v; + for (int i = 0; i < v.nlanes; i++) + { + v.s[i] = ptr[i]; + } + return v; +} +#endif -inline void -v_pack_store(float16_t* ptr, const v_reg::nlanes128>& v) +template inline void +v_pack_store(float16_t* ptr, const v_reg& v) { for( int i = 0; i < v.nlanes; i++ ) { @@ -2770,6 +3298,12 @@ v_pack_store(float16_t* ptr, const v_reg::nlanes128>& } inline void v_cleanup() {} +#if CV_SIMD256 +inline void v256_cleanup() {} +#endif +#if CV_SIMD512 +inline void v512_cleanup() {} +#endif //! 
@}

@@ -2778,4 +3312,9 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 #endif
 }

+#if !defined(CV_DOXYGEN)
+#undef CV_SIMD256
+#undef CV_SIMD512
+#endif
+
 #endif
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 280691b448..785648575a 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -62,6 +62,22 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD128_64F 0
 #endif

+// The following macro checks if the code is being compiled for the
+// AArch64 execution state of Armv8, to enable the 128-bit
+// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
+// the Arm C Language Extension (ACLE) specifications [1] to check the
+// availability of 128-bit intrinsics, and it is supported by clang
+// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
+// Visual Studio [2].
+//
+// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
+// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+#define CV_NEON_AARCH64 1
+#else
+#define CV_NEON_AARCH64 0
+#endif
+
 // TODO
 #define CV_NEON_DOT 0

@@ -726,41 +742,61 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
 // 16 >> 32
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmull_s16(a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmlal_s16(c.val, a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }

 // 32 >> 64
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmull_s32(a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmlal_s32(c.val, a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }

 // 8 >> 32
@@ -1292,7 +1328,7 @@ inline int64 v_reduce_sum(const v_int64x2& a)
 #if CV_SIMD128_64F
 inline double v_reduce_sum(const v_float64x2& a)
 {
-    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
+    return vaddvq_f64(a.val);
 }
 #endif

@@ -1503,6
+1539,26 @@ OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32) OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64) #endif +#if CV_NEON_AARCH64 +#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \ + b1.val = vmovl_high_##suffix(a.val); \ +} \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \ +} \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + return _Tpwvec(vmovl_high_##suffix(a.val)); \ +} \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \ +} +#else #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ { \ @@ -1521,6 +1577,7 @@ inline _Tpwvec v_load_expand(const _Tp* ptr) \ { \ return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \ } +#endif OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8) OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8) diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp index eca787c7fd..cb2140df58 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp @@ -2,309 +2,2316 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. +// The original implementation has been contributed by Yin Zhang. +// Copyright (C) 2020, Institute of Software, Chinese Academy of Sciences. + #ifndef OPENCV_HAL_INTRIN_RVV_HPP #define OPENCV_HAL_INTRIN_RVV_HPP -#include -#include #include -#include "opencv2/core/saturate.hpp" - -#define CV_SIMD128_CPP 1 -#if defined(CV_FORCE_SIMD128_CPP) || defined(CV_DOXYGEN) -#define CV_SIMD128 1 -#define CV_SIMD128_64F 1 -#endif namespace cv { -#ifndef CV_DOXYGEN CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN -#endif +#define CV_SIMD128 1 +#define CV_SIMD128_64F 1 -template struct v_reg +//////////// Unsupported native intrinsics in C++ //////////// + +struct vuint8mf2_t { - typedef _Tp lane_type; - enum { nlanes = n }; - - explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; } - - v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; } - - v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; } - - v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, - _Tp s4, _Tp s5, _Tp s6, _Tp s7) + uchar val[8] = {0}; + vuint8mf2_t() {} + vuint8mf2_t(const uchar* ptr) { - s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; - s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; - } - - v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, - _Tp s4, _Tp s5, _Tp s6, _Tp s7, - _Tp s8, _Tp s9, _Tp s10, _Tp s11, - _Tp s12, _Tp s13, _Tp s14, _Tp s15) - { - s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; - s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; - s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11; - s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15; - } - - v_reg() {} - - v_reg(const v_reg<_Tp, n> & r) - { - for( int i = 0; i < n; i++ ) - s[i] = r.s[i]; - } - _Tp get0() const { return s[0]; } - - _Tp get(const int i) const { return s[i]; } - v_reg<_Tp, n> high() const - { - v_reg<_Tp, n> c; - int i; - for( i = 0; i < n/2; i++ ) + for (int i = 0; i < 8; ++i) { - c.s[i] = s[i+(n/2)]; - c.s[i+(n/2)] = 0; + val[i] = ptr[i]; } - return c; } - - static v_reg<_Tp, n> zero() +}; +struct 
vint8mf2_t +{ + schar val[8] = {0}; + vint8mf2_t() {} + vint8mf2_t(const schar* ptr) { - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = (_Tp)0; - return c; + for (int i = 0; i < 8; ++i) + { + val[i] = ptr[i]; + } } - - static v_reg<_Tp, n> all(_Tp s) +}; +struct vuint16mf2_t +{ + ushort val[4] = {0}; + vuint16mf2_t() {} + vuint16mf2_t(const ushort* ptr) { - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = s; - return c; + for (int i = 0; i < 4; ++i) + { + val[i] = ptr[i]; + } } - - template v_reg<_Tp2, n2> reinterpret_as() const +}; +struct vint16mf2_t +{ + short val[4] = {0}; + vint16mf2_t() {} + vint16mf2_t(const short* ptr) { - size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); - v_reg<_Tp2, n2> c; - std::memcpy(&c.s[0], &s[0], bytes); - return c; + for (int i = 0; i < 4; ++i) + { + val[i] = ptr[i]; + } } - - v_reg& operator=(const v_reg<_Tp, n> & r) +}; +struct vuint32mf2_t +{ + unsigned val[2] = {0}; + vuint32mf2_t() {} + vuint32mf2_t(const unsigned* ptr) { - for( int i = 0; i < n; i++ ) - s[i] = r.s[i]; - return *this; + val[0] = ptr[0]; + val[1] = ptr[1]; + } +}; +struct vint32mf2_t +{ + int val[2] = {0}; + vint32mf2_t() {} + vint32mf2_t(const int* ptr) + { + val[0] = ptr[0]; + val[1] = ptr[1]; + } +}; +struct vfloat32mf2_t +{ + float val[2] = {0}; + vfloat32mf2_t() {} + vfloat32mf2_t(const float* ptr) + { + val[0] = ptr[0]; + val[1] = ptr[1]; + } +}; +struct vuint64mf2_t +{ + uint64 val[1] = {0}; + vuint64mf2_t() {} + vuint64mf2_t(const uint64* ptr) + { + val[0] = ptr[0]; + } +}; +struct vint64mf2_t +{ + int64 val[1] = {0}; + vint64mf2_t() {} + vint64mf2_t(const int64* ptr) + { + val[0] = ptr[0]; + } +}; +struct vfloat64mf2_t +{ + double val[1] = {0}; + vfloat64mf2_t() {} + vfloat64mf2_t(const double* ptr) + { + val[0] = ptr[0]; + } +}; +struct vuint8mf4_t +{ + uchar val[4] = {0}; + vuint8mf4_t() {} + vuint8mf4_t(const uchar* ptr) + { + for (int i = 0; i < 4; ++i) + { + val[i] = ptr[i]; + } + } +}; +struct vint8mf4_t +{ + schar val[4] = {0}; + vint8mf4_t() {} + vint8mf4_t(const schar* ptr) + { + for (int i = 0; i < 4; ++i) + { + val[i] = ptr[i]; + } } - - _Tp s[n]; }; -typedef v_reg v_uint8x16; -typedef v_reg v_int8x16; -typedef v_reg v_uint16x8; -typedef v_reg v_int16x8; -typedef v_reg v_uint32x4; -typedef v_reg v_int32x4; -typedef v_reg v_float32x4; -typedef v_reg v_float64x2; -typedef v_reg v_uint64x2; -typedef v_reg v_int64x2; - -template CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - - -template CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> 
operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a); - - -#ifndef CV_DOXYGEN - -#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \ -__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(short, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(int, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(int64, __VA_ARGS__)) \ - -#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \ -__CV_EXPAND(macro_name(float, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(double, __VA_ARGS__)) \ - -#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \ -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \ - -#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \ -template inline \ -v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \ +inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr) \ { \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return c; \ + return _Tpvec(ptr); \ } \ -template inline \ -v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v) \ { \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return a; \ + for (int i = 0; i < n; ++i) \ + { \ + ptr[i] = v.val[i]; \ + } \ } -#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1) -CV__HAL_INTRIN_IMPL_BIN_OP(+) -CV__HAL_INTRIN_IMPL_BIN_OP(-) -CV__HAL_INTRIN_IMPL_BIN_OP(*) -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /) -#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \ -template CV_INLINE \ -v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \ +inline _Tpwvec wcvt (_Tpvec v) \ { \ - v_reg<_Tp, n> c; \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return c; \ -} \ -template CV_INLINE \ -v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const 
v_reg<_Tp, n>& b) \ -{ \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return a; \ + _wTp tmp[n]; \ + for (int i = 0; i < n; ++i) \ + { \ + tmp[i] = (_wTp)v.val[i]; \ + } \ + vsetvlmax_e##width##m1(); \ + return vle##width##_v_##suffix##m1(tmp); \ } -#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */ +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2) +inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base) +{ + return vuint8mf4_t(base); +} +inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base) +{ + return vint8mf4_t(base); +} -CV__HAL_INTRIN_IMPL_BIT_OP(&) -CV__HAL_INTRIN_IMPL_BIT_OP(|) -CV__HAL_INTRIN_IMPL_BIT_OP(^) +inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src) +{ + ushort tmp[4]; + for (int i = 0; i < 4; ++i) + { + tmp[i] = (ushort)src.val[i]; + } + return vle16_v_u16mf2(tmp); +} +inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src) +{ + short tmp[4]; + for (int i = 0; i < 4; ++i) + { + tmp[i] = (short)src.val[i]; + } + return vle16_v_i16mf2(tmp); +} -#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \ -template CV_INLINE \ -v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ -{ \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \ - return c; \ -} \ +//////////// Types //////////// -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~) +struct v_uint8x16 +{ + typedef uchar lane_type; + enum { nlanes = 16 }; + v_uint8x16() {} + explicit v_uint8x16(vuint8m1_t v) + { + vsetvlmax_e8m1(); + vse8_v_u8m1(val, v); + } + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vuint8m1_t() const + { + vsetvlmax_e8m1(); + return vle8_v_u8m1(val); + } + uchar get0() const + { + return val[0]; + } + + uchar val[16]; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(vint8m1_t v) + { + vsetvlmax_e8m1(); + vse8_v_i8m1(val, v); + } + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vint8m1_t() const + { 
+ vsetvlmax_e8m1(); + return vle8_v_i8m1(val); + } + schar get0() const + { + return val[0]; + } + + schar val[16]; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + enum { nlanes = 8 }; + + v_uint16x8() {} + explicit v_uint16x8(vuint16m1_t v) + { + vsetvlmax_e16m1(); + vse16_v_u16m1(val, v); + } + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vuint16m1_t() const + { + vsetvlmax_e16m1(); + return vle16_v_u16m1(val); + } + ushort get0() const + { + return val[0]; + } + + ushort val[8]; +}; + +struct v_int16x8 +{ + typedef short lane_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(vint16m1_t v) + { + vsetvlmax_e16m1(); + vse16_v_i16m1(val, v); + } + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vint16m1_t() const + { + vsetvlmax_e16m1(); + return vle16_v_i16m1(val); + } + short get0() const + { + return val[0]; + } + + short val[8]; +}; + +struct v_uint32x4 +{ + typedef unsigned lane_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(vuint32m1_t v) + { + vsetvlmax_e32m1(); + vse32_v_u32m1(val, v); + } + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + unsigned v[] = {v0, v1, v2, v3}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vuint32m1_t() const + { + vsetvlmax_e32m1(); + return vle32_v_u32m1(val); + } + unsigned get0() const + { + return val[0]; + } + + unsigned val[4]; +}; + +struct v_int32x4 +{ + typedef int lane_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(vint32m1_t v) + { + vsetvlmax_e32m1(); + vse32_v_i32m1(val, v); + } + v_int32x4(int v0, int v1, int v2, int v3) + { + int v[] = {v0, v1, v2, v3}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vint32m1_t() const + { + vsetvlmax_e32m1(); + return vle32_v_i32m1(val); + } + int get0() const + { + return val[0]; + } + int val[4]; +}; + +struct v_float32x4 +{ + typedef float lane_type; + enum { nlanes = 4 }; + + v_float32x4() {} + explicit v_float32x4(vfloat32m1_t v) + { + vsetvlmax_e32m1(); + vse32_v_f32m1(val, v); + } + v_float32x4(float v0, float v1, float v2, float v3) + { + float v[] = {v0, v1, v2, v3}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vfloat32m1_t() const + { + vsetvlmax_e32m1(); + return vle32_v_f32m1(val); + } + float get0() const + { + return val[0]; + } + float val[4]; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + enum { nlanes = 2 }; + + v_uint64x2() {} + explicit v_uint64x2(vuint64m1_t v) + { + vsetvlmax_e64m1(); + vse64_v_u64m1(val, v); + } + v_uint64x2(uint64 v0, uint64 v1) + { + uint64 v[] = {v0, v1}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vuint64m1_t() const + { + vsetvlmax_e64m1(); + return vle64_v_u64m1(val); + } + uint64 get0() const + { + return val[0]; + } + + uint64 val[2]; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2 }; + + v_int64x2() {} + explicit v_int64x2(vint64m1_t v) + { + vsetvlmax_e64m1(); + vse64_v_i64m1(val, v); + } + v_int64x2(int64 v0, int64 v1) + { + int64 v[] = {v0, v1}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vint64m1_t() const + { + vsetvlmax_e64m1(); + 
return vle64_v_i64m1(val); + } + int64 get0() const + { + return val[0]; + } + + int64 val[2]; +}; + +#if CV_SIMD128_64F +struct v_float64x2 +{ + typedef double lane_type; + enum { nlanes = 2 }; + + v_float64x2() {} + explicit v_float64x2(vfloat64m1_t v) + { + vsetvlmax_e64m1(); + vse64_v_f64m1(val, v); + } + v_float64x2(double v0, double v1) + { + double v[] = {v0, v1}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vfloat64m1_t() const + { + vsetvlmax_e64m1(); + return vle64_v_f64m1(val); + } + double get0() const + { + return val[0]; + } + + double val[2]; +}; #endif -#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ -template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ -{ \ - v_reg<_Tp2, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cfunc(a.s[i]); \ - return c; \ -} +//////////// Initial //////////// -#define OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(func, cfunc) \ -inline v_reg func(const v_reg& a) \ +#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, width, suffix1, suffix2) \ +inline v_##_Tpvec v_setzero_##suffix1() \ { \ - v_reg c; \ - for( int i = 0; i < 4; i++ ) \ - c.s[i] = cfunc(a.s[i]); \ - return c; \ + vsetvlmax_e##width##m1(); \ + return v_##_Tpvec(vzero_##suffix2##m1()); \ } \ -inline v_reg func(const v_reg& a) \ +inline v_##_Tpvec v_setall_##suffix1(_Tp v) \ { \ - v_reg c; \ - for( int i = 0; i < 2; i++ ) \ + vsetvlmax_e##width##m1(); \ + return v_##_Tpvec(vmv_v_x_##suffix2##m1(v)); \ +} + +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, 8, u8, u8) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, 8, s8, i8) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, 16, u16, u16) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, 16, s16, i16) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, 32, u32, u32) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, 32, s32, i32) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, 64, u64, u64) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, 64, s64, i64) + +#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, width, suffix) \ +inline v_##_Tpv v_setzero_##suffix() \ +{ \ + vsetvlmax_e##width##m1(); \ + return v_##_Tpv(vzero_##suffix##m1()); \ +} \ +inline v_##_Tpv v_setall_##suffix(_Tp v) \ +{ \ + vsetvlmax_e##width##m1(); \ + return v_##_Tpv(vfmv_v_f_##suffix##m1(v)); \ +} + +OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, 32, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, 64, f64) +#endif + +//////////// Reinterpret //////////// + +#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \ +inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; } + +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64) +#endif + +#define OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(_Tpvec1, _Tpvec2, _nTpvec1, _nTpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \ +inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ +{ \ + vsetvlmax_e##width2##m1(); \ + return v_##_Tpvec1((_nTpvec1)vle##width2##_v_##nsuffix2##m1(v.val)); \ +} \ +inline v_##_Tpvec2 
v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ +{ \ + vsetvlmax_e##width1##m1(); \ + return v_##_Tpvec2((_nTpvec2)vle##width1##_v_##nsuffix1##m1(v.val)); \ +} + +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int8x16, vuint8m1_t, vint8m1_t, u8, s8, u8, i8, 8, 8) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int16x8, vuint16m1_t, vint16m1_t, u16, s16, u16, i16, 16, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int32x4, vuint32m1_t, vint32m1_t, u32, s32, u32, i32, 32, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float32x4, vuint32m1_t, vfloat32m1_t, u32, f32, u32, f32, 32, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float32x4, vint32m1_t, vfloat32m1_t, s32, f32, i32, f32, 32, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int64x2, vuint64m1_t, vint64m1_t, u64, s64, u64, i64, 64, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint16x8, vuint8m1_t, vuint16m1_t, u8, u16, u8, u16, 8, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint32x4, vuint8m1_t, vuint32m1_t, u8, u32, u8, u32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint64x2, vuint8m1_t, vuint64m1_t, u8, u64, u8, u64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint32x4, vuint16m1_t, vuint32m1_t, u16, u32, u16, u32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint64x2, vuint16m1_t, vuint64m1_t, u16, u64, u16, u64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, uint64x2, vuint32m1_t, vuint64m1_t, u32, u64, u32, u64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int16x8, vint8m1_t, vint16m1_t, s8, s16, i8, i16, 8, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int32x4, vint8m1_t, vint32m1_t, s8, s32, i8, i32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int64x2, vint8m1_t, vint64m1_t, s8, s64, i8, i64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int32x4, vint16m1_t, vint32m1_t, s16, s32, i16, i32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int64x2, vint16m1_t, vint64m1_t, s16, s64, i16, i64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, int64x2, vint32m1_t, vint64m1_t, s32, s64, i32, i64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int16x8, vuint8m1_t, vint16m1_t, u8, s16, u8, i16, 8, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int32x4, vuint8m1_t, vint32m1_t, u8, s32, u8, i32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int64x2, vuint8m1_t, vint64m1_t, u8, s64, u8, i64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int8x16, vuint16m1_t, vint8m1_t, u16, s8, u16, i8, 16, 8) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int32x4, vuint16m1_t, vint32m1_t, u16, s32, u16, i32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int64x2, vuint16m1_t, vint64m1_t, u16, s64, u16, i64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int8x16, vuint32m1_t, vint8m1_t, u32, s8, u32, i8, 32, 8) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int16x8, vuint32m1_t, vint16m1_t, u32, s16, u32, i16, 32, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int64x2, vuint32m1_t, vint64m1_t, u32, s64, u32, i64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int8x16, vuint64m1_t, vint8m1_t, u64, s8, u64, i8, 64, 8) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int16x8, vuint64m1_t, vint16m1_t, u64, s16, u64, i16, 64, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int32x4, vuint64m1_t, vint32m1_t, u64, s32, u64, i32, 64, 32) 
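// --- Usage sketch (editorial, not part of the patch) -------------------------
// The reinterpret casts defined here are emulated by a store/reload round trip
// at the new element width, so they are bit-exact views of the same 128 bits.
// A minimal sketch of what that enables, assuming the usual umbrella header
// and cv namespace; the helper name v_abs_bits is an illustrative assumption:
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Absolute value of four floats by clearing the IEEE-754 sign bit,
// round-tripping the register through its u32 view.
inline v_float32x4 v_abs_bits(const v_float32x4& x)
{
    v_uint32x4 bits = v_reinterpret_as_u32(x);  // same 128 bits, viewed as u32 lanes
    bits = bits & v_setall_u32(0x7fffffffu);    // clear the sign bit in every lane
    return v_reinterpret_as_f32(bits);          // reinterpret back to float lanes
}
// -----------------------------------------------------------------------------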
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float32x4, vuint8m1_t, vfloat32m1_t, u8, f32, u8, f32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float32x4, vuint16m1_t, vfloat32m1_t, u16, f32, u16, f32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float32x4, vuint64m1_t, vfloat32m1_t, u64, f32, u64, f32, 64, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float32x4, vint8m1_t, vfloat32m1_t, s8, f32, i8, f32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float32x4, vint16m1_t, vfloat32m1_t, s16, f32, i16, f32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float32x4, vint64m1_t, vfloat32m1_t, s64, f32, i64, f32, 64, 32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float64x2, vuint64m1_t, vfloat64m1_t, u64, f64, u64, f64, 64, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float64x2, vint64m1_t, vfloat64m1_t, s64, f64, i64, f64, 64, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float64x2, vuint8m1_t, vfloat64m1_t, u8, f64, u8, f64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float64x2, vuint16m1_t, vfloat64m1_t, u16, f64, u16, f64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float64x2, vuint32m1_t, vfloat64m1_t, u32, f64, u32, f64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float64x2, vint8m1_t, vfloat64m1_t, s8, f64, i8, f64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float64x2, vint16m1_t, vfloat64m1_t, s16, f64, i16, f64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float64x2, vint32m1_t, vfloat64m1_t, s32, f64, i32, f64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(float32x4, float64x2, vfloat32m1_t, vfloat64m1_t, f32, f64, f32, f64, 32, 64) +#endif + +////////////// Extract ////////////// + +#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, suffix, width, vmv) \ +template \ +inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, s), b, _Tpvec::nlanes - s)); \ +} \ +template inline _Tp v_extract_n(_Tpvec v) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tp(vmv(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), v, i))); \ +} + + +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8x16, uchar, u8, 8, vmv_x_s_u8m1_u8) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8x16, schar, i8, 8, vmv_x_s_i8m1_i8) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16x8, ushort, u16, 16, vmv_x_s_u16m1_u16) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16x8, short, i16, 16, vmv_x_s_i16m1_i16) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32x4, uint, u32, 32, vmv_x_s_u32m1_u32) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32x4, int, i32, 32, vmv_x_s_i32m1_i32) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64x2, uint64, u64, 64, vmv_x_s_u64m1_u64) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64x2, int64, i64, 64, vmv_x_s_i64m1_i64) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32x4, float, f32, 32, vfmv_f_s_f32m1_f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64x2, double, f64, 64, vfmv_f_s_f64m1_f64) +#endif + +////////////// Load/Store ////////////// + +#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, width, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ \ + vsetvlmax_e8m1(); \ + return _Tpvec((_nTpvec)vle8_v_u8m1((uchar*)ptr)); \ +} \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vle##width##_v_##suffix##m1(ptr)); \ +} \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ \ + vsetvl_e##width##m1(hvl); \ + _Tpvec 
res = _Tpvec(vle##width##_v_##suffix##m1(ptr)); \ + vsetvlmax_e##width##m1(); \ + return res; \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ \ + vsetvlmax_e8m1(); \ + vse8_v_u8m1((uchar*)ptr, vle8_v_u8m1((uchar*)a.val)); \ +} \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(ptr, a); \ +} \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(ptr, a); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(ptr, a); \ +} \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ \ + _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(tmp_ptr, a); \ + for(int i = 0; i < _Tpvec::nlanes/2; ++i) \ { \ - c.s[i] = cfunc(a.s[i]); \ - c.s[i + 2] = 0; \ + ptr[i] = tmp_ptr[i]; \ } \ - return c; \ -} - -OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) - -OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) - -OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, - typename V_TypeTraits<_Tp>::abs_type) - -OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_round, cvRound) - -OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_floor, cvFloor) - -OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_ceil, cvCeil) - -OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_trunc, int) - -#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \ -template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +} \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ { \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cfunc(a.s[i], b.s[i]); \ - return c; \ + _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(tmp_ptr, a); \ + for(int i = 0; i < _Tpvec::nlanes/2; ++i) \ + { \ + ptr[i] = tmp_ptr[i+_Tpvec::nlanes/2]; \ + } \ } -#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \ -template inline _Tp func(const v_reg<_Tp, n>& a) \ +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 8, u8) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 8, i8) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 16, u16) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 16, i16) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 32, u32) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 32, i32) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 64, u64) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 64, i64) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 32, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 64, f64) +#endif + +inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7], + ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7] + }; + vsetvlmax_e8m1(); + return v_int8x16(vle8_v_i8m1(elems)); +} +inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1)); } + +inline 
v_int16x8 v_load_halves(const short* ptr0, const short* ptr1) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3] + }; + vsetvlmax_e16m1(); + return v_int16x8(vle16_v_i16m1(elems)); +} +inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1)); } + +inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + ptr0[0], ptr0[1], ptr1[0], ptr1[1] + }; + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(elems)); +} +inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + ptr0[0], ptr0[1], ptr1[0], ptr1[1] + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} +inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1)); } + +inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1) +{ + int64 CV_DECL_ALIGNED(32) elems[2] = + { + ptr0[0], ptr1[0] + }; + vsetvlmax_e64m1(); + return v_int64x2(vle64_v_i64m1(elems)); +} +inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1)); } + +#if CV_SIMD128_64F +inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + ptr0[0], ptr1[0] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} +#endif + + +////////////// Lookup table access //////////////////// + +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[ 0]], + tab[idx[ 1]], + tab[idx[ 2]], + tab[idx[ 3]], + tab[idx[ 4]], + tab[idx[ 5]], + tab[idx[ 6]], + tab[idx[ 7]], + tab[idx[ 8]], + tab[idx[ 9]], + tab[idx[10]], + tab[idx[11]], + tab[idx[12]], + tab[idx[13]], + tab[idx[14]], + tab[idx[15]] + }; + vsetvlmax_e8m1(); + return v_int8x16(vle8_v_i8m1(elems)); +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[4]], + tab[idx[4] + 1], + tab[idx[5]], + tab[idx[5] + 1], + tab[idx[6]], + tab[idx[6] + 1], + tab[idx[7]], + tab[idx[7] + 1] + }; + vsetvlmax_e8m1(); + return v_int8x16(vle8_v_i8m1(elems)); +} +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[2] + 2], + tab[idx[2] + 3], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[3] + 2], + tab[idx[3] + 3] + }; + vsetvlmax_e8m1(); + return v_int8x16(vle8_v_i8m1(elems)); +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]], + 
tab[idx[4]], + tab[idx[5]], + tab[idx[6]], + tab[idx[7]] + }; + vsetvlmax_e16m1(); + return v_int16x8(vle16_v_i16m1(elems)); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1] + }; + vsetvlmax_e16m1(); + return v_int16x8(vle16_v_i16m1(elems)); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3] + }; + vsetvlmax_e16m1(); + return v_int16x8(vle16_v_i16m1(elems)); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(elems)); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1] + }; + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(elems)); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(tab + idx[0])); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + int64_t CV_DECL_ALIGNED(32) elems[2] = + { + tab[idx[0]], + tab[idx[1]] + }; + vsetvlmax_e64m1(); + return v_int64x2(vle64_v_i64m1(elems)); +} +inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx) +{ + vsetvlmax_e64m1(); + return v_int64x2(vle64_v_i64m1(tab + idx[0])); +} +inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1] + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) +{ + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(tab + idx[0])); +} + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[v_extract_n<0>(idxvec)], + 
tab[v_extract_n<1>(idxvec)], + tab[v_extract_n<2>(idxvec)], + tab[v_extract_n<3>(idxvec)] + }; + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(elems)); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + unsigned CV_DECL_ALIGNED(32) elems[4] = + { + tab[v_extract_n<0>(idxvec)], + tab[v_extract_n<1>(idxvec)], + tab[v_extract_n<2>(idxvec)], + tab[v_extract_n<3>(idxvec)] + }; + vsetvlmax_e32m1(); + return v_uint32x4(vle32_v_u32m1(elems)); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[v_extract_n<0>(idxvec)], + tab[v_extract_n<1>(idxvec)], + tab[v_extract_n<2>(idxvec)], + tab[v_extract_n<3>(idxvec)] + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); + y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[idx[0]], + tab[idx[1]] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} + +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) +{ + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(tab + idx[0])); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[v_extract_n<0>(idxvec)], + tab[v_extract_n<1>(idxvec)] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4] = {0}; + v_store_aligned(idx, idxvec); + + x = v_float64x2(tab[idx[0]], tab[idx[1]]); + y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); +} +#endif + +////////////// Pack boolean //////////////////// + +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + ushort CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_store(ptr, a); + v_store(ptr + 8, b); + vsetvlmax_e8m1(); + return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr), 0)); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + unsigned CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_store(ptr, a); + v_store(ptr + 4, b); + v_store(ptr + 8, c); + v_store(ptr + 12, d); + vsetvlmax_e8m1(); + return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr), 0), 0)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + uint64 CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_store(ptr, a); + v_store(ptr + 2, b); + v_store(ptr + 4, c); + v_store(ptr + 6, d); + v_store(ptr + 8, e); + v_store(ptr + 10, f); + v_store(ptr + 12, g); + v_store(ptr + 14, h); + vsetvlmax_e8m1(); + return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr), 0), 0), 0)); +} + +////////////// Arithmetics ////////////// +#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, width) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ { \ - _Tp c = a.s[0]; \ - for( int i = 1; i < n; i++ ) \ - c = cfunc(c, a.s[i]); \ - return c; \ + 
vsetvlmax_e##width##m1(); \ + return _Tpvec(intrin(a, b)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + a = _Tpvec(intrin(a, b)); \ + return a; \ } -OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 64) +#endif -OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max) -OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min) +////////////// Bitwise logic ////////////// -OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) +#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, width) \ +OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, width) \ +OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, width) \ +OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, width) \ +inline _Tpvec operator ~ (const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vnot_v_##suffix##m1(a)); \ +} + +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 8) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 8) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 16) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 16) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 32) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 32) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 64) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 64) + +#define 
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \ +inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +{ \ + vsetvlmax_e32m1(); \ + return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \ +} \ +inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ +{ \ + vsetvlmax_e32m1(); \ + a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \ + return a; \ +} + +OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1) +OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1) +OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1) + +inline v_float32x4 operator ~ (const v_float32x4& a) +{ + vsetvlmax_e32m1(); + return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a)))); +} + +#if CV_SIMD128_64F +#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \ +inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +{ \ + vsetvlmax_e64m1(); \ + return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \ +} \ +inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ +{ \ + vsetvlmax_e64m1(); \ + a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \ + return a; \ +} + +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1) +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1) +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1) + +inline v_float64x2 operator ~ (const v_float64x2& a) +{ + vsetvlmax_e64m1(); + return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a)))); +} +#endif + +////////////// Bitwise shifts ////////////// + +#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, width) \ +inline _Tpvec operator << (const _Tpvec& a, int n) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +inline _Tpvec operator >> (const _Tpvec& a, int n) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +template inline _Tpvec v_shl(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +template inline _Tpvec v_shr(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \ +} + +#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, width) \ +inline _Tpvec operator << (const _Tpvec& a, int n) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +inline _Tpvec operator >> (const _Tpvec& a, int n) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +template inline _Tpvec v_shl(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +template inline _Tpvec v_shr(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \ +} + +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 8) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 16) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 32) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 64) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 8) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 16) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 32) 
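// --- Usage sketch (editorial, not part of the patch) -------------------------
// The shift macros above pick vsrl (logical, zero fill) for unsigned lanes and
// vsra (arithmetic, sign fill) for signed lanes; v_shl/v_shr take the shift
// amount as a template parameter. A self-contained demo, assuming the umbrella
// header; the function name shift_demo is an illustrative assumption:
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

inline void shift_demo()
{
    v_int16x8  s  = v_setall_s16((short)-8);
    v_int16x8  sa = s >> 2;                  // arithmetic shift: every lane becomes -2
    v_uint16x8 u  = v_setall_u16((ushort)0x8000);
    v_uint16x8 ul = u >> 15;                 // logical shift: every lane becomes 1
    v_int16x8  tw = v_shl<1>(s);             // compile-time amount: every lane becomes -16
    (void)sa; (void)ul; (void)tw;            // silence unused-variable warnings
}
// -----------------------------------------------------------------------------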
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 64) + + +////////////// Comparison ////////////// + +#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, width) \ +inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \ +} + +#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, width) \ +inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \ +} + +#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, width) + +#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, width) + +#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, width) + + +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8) +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16) +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32) +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64) +OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64) +#endif + +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ return a == a; } + +#if CV_SIMD128_64F +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ return a == a; } +#endif + +////////////// Min/Max ////////////// + +#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, width) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(intrin(a, b)); \ +} + +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, 
vminu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 64) +#endif + +////////////// Arithmetics wrap ////////////// + +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 16) + +////////////// Reduce ////////////// + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, wwidth, red) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + vsetvlmax_e##wwidth##m1(); \ + _nwTpvec zero = vzero_##wsuffix##m1(); \ + _nwTpvec res = vzero_##wsuffix##m1(); \ + res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero); \ + return (scalartype)(_wTpvec(res).get0()); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 32, wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 32, wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 64, wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 64, wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 32, fredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 64, redsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, 
i64, i64, 64, redsum) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 64, fredsum) +#endif + + +#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, width, red) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a)); \ + return scalartype(res.get0()); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 8, redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 8, redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 16, redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 16, redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 32, redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 32, redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 32, fredmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 8, redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 8, redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 16, redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 16, redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 32, redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 32, redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 32, fredmax) + + +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + v_reduce_sum(a), + v_reduce_sum(b), + v_reduce_sum(c), + v_reduce_sum(d) + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} + +////////////// Square-Root ////////////// + +inline v_float32x4 v_sqrt(const v_float32x4& x) +{ + vsetvlmax_e32m1(); + return v_float32x4(vfsqrt_v_f32m1(x)); +} + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + v_float32x4 one = v_setall_f32(1.0f); + return one / v_sqrt(x); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_sqrt(const v_float64x2& x) +{ + vsetvlmax_e64m1(); + return v_float64x2(vfsqrt_v_f64m1(x)); +} + +inline v_float64x2 v_invsqrt(const v_float64x2& x) +{ + v_float64x2 one = v_setall_f64(1.0f); + return one / v_sqrt(x); +} +#endif + +inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + vsetvlmax_e32m1(); + v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b)); + return v_sqrt(x); +} + +inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + vsetvlmax_e32m1(); + return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b)); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + vsetvlmax_e64m1(); + v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b)); + return v_sqrt(x); +} + +inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + vsetvlmax_e64m1(); + return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b)); +} +#endif + +////////////// Multiply-Add ////////////// + +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + vsetvlmax_e32m1(); + return v_float32x4(vfmacc_vv_f32m1(c, a, b)); +} +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + vsetvlmax_e32m1(); + return v_int32x4(vmacc_vv_i32m1(c, a, b)); +} + +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_fma(a, b, c); +} + 
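// --- Usage sketch (editorial, not part of the patch) -------------------------
// v_fma(a, b, c) maps to a single fused vfmacc and computes a*b + c per lane;
// on this backend v_muladd simply forwards to v_fma. Example: evaluating the
// quadratic a*x^2 + b*x + c with two fused ops via Horner's rule (the helper
// name horner2 is an illustrative assumption):
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

inline v_float32x4 horner2(const v_float32x4& x, float a, float b, float c)
{
    v_float32x4 t = v_fma(v_setall_f32(a), x, v_setall_f32(b)); // t = a*x + b
    return v_fma(t, x, v_setall_f32(c));                        // (a*x + b)*x + c
}
// -----------------------------------------------------------------------------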
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + vsetvlmax_e64m1(); + return v_float64x2(vfmacc_vv_f64m1(c, a, b)); +} + +inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_fma(a, b, c); +} +#endif + +////////////// Check all/any ////////////// + +#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, width) \ +inline bool v_check_all(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a), shift)); \ + return (v.val[0] | v.val[1]) == 0; \ +} \ +inline bool v_check_any(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift)); \ + return (v.val[0] | v.val[1]) != 0; \ +} + +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 8) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 16) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 32) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 64) + + +inline bool v_check_all(const v_int8x16& a) +{ return v_check_all(v_reinterpret_as_u8(a)); } +inline bool v_check_any(const v_int8x16& a) +{ return v_check_any(v_reinterpret_as_u8(a)); } + +inline bool v_check_all(const v_int16x8& a) +{ return v_check_all(v_reinterpret_as_u16(a)); } +inline bool v_check_any(const v_int16x8& a) +{ return v_check_any(v_reinterpret_as_u16(a)); } + +inline bool v_check_all(const v_int32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_int32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } + +inline bool v_check_all(const v_float32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_float32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } + +inline bool v_check_all(const v_int64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_int64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } + +#if CV_SIMD128_64F +inline bool v_check_all(const v_float64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_float64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } +#endif + +////////////// abs ////////////// + +#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \ +inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return v_max(a, b) - v_min(a, b); \ +} + +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff) +#endif +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs) + +#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width) \ +inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b)), 0)); \ +} + +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 8) +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 16) +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 
32) + +#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \ +inline _Tprvec v_abs(const _Tpvec& a) \ +{ \ + return v_absdiff(a, v_setzero_##suffix()); \ +} + +OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8) +OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16) +OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32) +OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64) +#endif + + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \ +inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return v_reduce_sum(v_absdiff(a, b)); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float) + +////////////// Select ////////////// + +#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, width) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(merge(ne(mask, 0), b, a)); \ +} + +OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 8) +OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 8) +OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 16) +OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 16) +OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 32) +OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 32) +OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 64) +#endif + +////////////// Rotate shift ////////////// + +#define OPENCV_HAL_IMPL_RVV_ROTATE_OP(_Tpvec, suffix, width) \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslideup_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ +{ return a; } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n), b, _Tpvec::nlanes - n)); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), b, _Tpvec::nlanes - n), a, n)); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ +{ CV_UNUSED(b); return a; } + + +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint8x16, u8, 8) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int8x16, i8, 8) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint16x8, u16, 16) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int16x8, i16, 16) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint32x4, u32, 32) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int32x4, i32, 32) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float32x4, f32, 32) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint64x2, u64, 64) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int64x2, i64, 64) 
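// --- Usage sketch (editorial, not part of the patch) -------------------------
// The rotate ops are built from vslideup/vslidedown and shift lanes across the
// whole register: the one-vector forms fill vacated lanes with zeros, while the
// two-vector forms funnel lanes in from the second operand. Example: a 3-tap
// sliding-window sum across the boundary of two consecutive registers (the
// helper name sum3 is an illustrative assumption):
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

inline v_int32x4 sum3(const v_int32x4& cur, const v_int32x4& next)
{
    v_int32x4 s1 = v_rotate_right<1>(cur, next); // lanes {cur1, cur2, cur3, next0}
    v_int32x4 s2 = v_rotate_right<2>(cur, next); // lanes {cur2, cur3, next0, next1}
    return cur + s1 + s2;                        // out[i] = in[i] + in[i+1] + in[i+2]
}
// -----------------------------------------------------------------------------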
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float64x2, f64, 64)
+#endif
+
+////////////// Convert to float //////////////
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{
+    vsetvlmax_e32m1();
+    return v_float32x4(vfcvt_f_x_v_f32m1(a));
+}
+
+#if CV_SIMD128_64F
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    double arr[4] = {a.val[0], a.val[1], 0, 0};
+    vsetvlmax_e64m2();
+    vfloat64m2_t tmp = vle64_v_f64m2(arr);
+    vsetvlmax_e32m1();
+    return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
+    vsetvlmax_e64m2();
+    vfloat64m2_t tmp = vle64_v_f64m2(arr);
+    vsetvlmax_e32m1();
+    return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    double CV_DECL_ALIGNED(32) ptr[4] = {0};
+    vsetvlmax_e64m2();
+    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
+    double CV_DECL_ALIGNED(32) elems[2] =
+    {
+        ptr[0], ptr[1]
+    };
+    vsetvlmax_e64m1();
+    return v_float64x2(vle64_v_f64m1(elems));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    double CV_DECL_ALIGNED(32) ptr[4] = {0};
+    vsetvlmax_e64m2();
+    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
+    double CV_DECL_ALIGNED(32) elems[2] =
+    {
+        ptr[2], ptr[3]
+    };
+    vsetvlmax_e64m1();
+    return v_float64x2(vle64_v_f64m1(elems));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    double CV_DECL_ALIGNED(32) ptr[4] = {0};
+    vsetvlmax_e64m2();
+    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
+    double CV_DECL_ALIGNED(32) elems[2] =
+    {
+        ptr[0], ptr[1]
+    };
+    vsetvlmax_e64m1();
+    return v_float64x2(vle64_v_f64m1(elems));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    double CV_DECL_ALIGNED(32) ptr[4] = {0};
+    vsetvlmax_e64m2();
+    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
+    double CV_DECL_ALIGNED(32) elems[2] =
+    {
+        ptr[2], ptr[3]
+    };
+    vsetvlmax_e64m1();
+    return v_float64x2(vle64_v_f64m1(elems));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+    vsetvlmax_e64m1();
+    return v_float64x2(vfcvt_f_x_v_f64m1(a));
+}
+#endif
+
+////////////// Broadcast //////////////
+
+#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
+template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
+{ \
+    return v_setall_##suffix(v_extract_n<i>(v)); \
+}
+
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64)
+#endif
+
+////////////// Transpose4x4 //////////////
+
+#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    _Tp CV_DECL_ALIGNED(32) elems0[4] = \
+    { \
+        v_extract_n<0>(a0), \
+        v_extract_n<0>(a1), \
+        v_extract_n<0>(a2), \
+        v_extract_n<0>(a3) \
+    }; \
+    b0 = v_load(elems0); \
+    _Tp CV_DECL_ALIGNED(32) elems1[4] = \
+    { \
+        v_extract_n<1>(a0), \
+        v_extract_n<1>(a1), \
+        v_extract_n<1>(a2), \
+        v_extract_n<1>(a3) \
+    }; \
+    b1 = v_load(elems1); \
+    _Tp CV_DECL_ALIGNED(32) elems2[4] = \
+    { \
+        v_extract_n<2>(a0), \
+        v_extract_n<2>(a1), \
+        v_extract_n<2>(a2), \
+        v_extract_n<2>(a3) \
+    }; \
+    b2 = v_load(elems2); \
+    _Tp CV_DECL_ALIGNED(32) elems3[4] = \
+    { \
+        v_extract_n<3>(a0), \
+        v_extract_n<3>(a1), \
+        v_extract_n<3>(a2), \
+        v_extract_n<3>(a3) \
+    }; \
+    b3 = v_load(elems3); \
+}
+
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
+
+////////////// Reverse //////////////
+
+#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, width, suffix) \
+inline _Tpvec v_reverse(const _Tpvec& a) \
+{ \
+    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptra[_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    for (int i = 0; i < _Tpvec::nlanes; i++) \
+    { \
+        ptr[i] = ptra[_Tpvec::nlanes-i-1]; \
+    } \
+    return v_load(ptr); \
+}
+
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, 8, u8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, 8, i8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, 16, u16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, 16, i16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, 32, u32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, 32, i32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, 32, f32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, 64, u64)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, 64, i64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, 64, f64)
+#endif
+
+//////////// Value reordering ////////////
+
+#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
+    v_store_low(lptr, a); \
+    v_store_high(hptr, a); \
+    b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
+    b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
+    v_store_low(lptr, a); \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
+    v_store_high(hptr, a); \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr))); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    vsetvlmax_e32m1();
+    return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr))));
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    vsetvlmax_e32m1();
+    return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr))));
+}
+
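+
+// A small usage sketch of the widening loads above (the buffer name is
+// illustrative):
+//
+//   uchar buf[16];
+//   v_uint8x16 a = v_load(buf);
+//   v_uint16x8 lo, hi;
+//   v_expand(a, lo, hi);                  // lo = lanes 0..7, hi = lanes 8..15
+//   v_uint16x8 w = v_load_expand(buf);    // load 8 bytes, widen 8 -> 16 bit
+//   v_uint32x4 q = v_load_expand_q(buf);  // load 4 bytes, widen 8 -> 32 bit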
+
+#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, shr) \
+inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    vsetvlmax_e##width##m2(); \
+    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0)); \
+} \
+inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
+    vsetvlmax_e##width##m2(); \
+    v_store(ptr, _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0))); \
+} \
+template<int n> inline \
+_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    vsetvlmax_e##width##m2(); \
+    return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n)); \
+} \
+template<int n> inline \
+void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
+    vsetvlmax_e##width##m2(); \
+    v_store(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n))); \
+}
+
+OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 16, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1)
+OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 16, i16, vnclip_wx_i8m1, vnclip_wx_i8m1)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 32, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1)
+OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 32, i32, vnclip_wx_i16m1, vnclip_wx_i16m1)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 64, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1)
+OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 64, i64, vnclip_wx_i32m1, vnsra_wx_i32m1)
+
+
+#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, cast) \
+inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    vsetvlmax_e##width##m2(); \
+    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0)); \
+} \
+inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
+    vsetvlmax_e##width##m2(); \
+    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0))); \
+} \
+template<int n> inline \
+_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    vsetvlmax_e##width##m2(); \
+    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n)); \
+} \
+template<int n> inline \
+void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
+    vsetvlmax_e##width##m2(); \
+    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n))); \
+}
+
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 16, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2)
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 32, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2)
+
+
+#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, width, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& 
b0, v_##_Tpvec& b1) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra0[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptra1[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb0[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb1[v_##_Tpvec::nlanes] = {0}; \ + v_store(ptra0, a0); \ + v_store(ptra1, a1); \ + int i; \ + for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \ + { \ + ptrb0[i*2] = ptra0[i]; \ + ptrb0[i*2+1] = ptra1[i]; \ + } \ + for( ; i < v_##_Tpvec::nlanes; i++ ) \ + { \ + ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \ + ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \ + } \ + b0 = v_load(ptrb0); \ + b1 = v_load(ptrb1); \ +} \ +inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \ + v_store_low(ptra, a); \ + v_store_low(ptrb, b); \ + return v_load_halves(ptra, ptrb); \ +} \ +inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \ + v_store_high(ptra, a); \ + v_store_high(ptrb, b); \ + return v_load_halves(ptra, ptrb); \ +} \ +inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + c = v_combine_low(a, b); \ + d = v_combine_high(a, b); \ +} + +OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, 8, u8) +OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, 8, i8) +OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, 16, u16) +OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, 16, i16) +OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, 32, u32) +OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, 32, i32) +OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, 32, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, 64, f64) +#endif + + +#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \ + int i, i2; \ + for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \ + { \ + ptra[i] = ptr[i2]; \ + ptrb[i] = ptr[i2+1]; \ + } \ + a = v_load(ptra); \ + b = v_load(ptrb); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \ + int i, i3; \ + for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \ + { \ + ptra[i] = ptr[i3]; \ + ptrb[i] = ptr[i3+1]; \ + ptrc[i] = ptr[i3+2]; \ + } \ + a = v_load(ptra); \ + b = v_load(ptrb); \ + c = v_load(ptrc); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ + v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \ + int i, i4; \ + for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \ + { \ + ptra[i] = ptr[i4]; \ + ptrb[i] = ptr[i4+1]; \ + ptrc[i] = ptr[i4+2]; \ + ptrd[i] = ptr[i4+3]; \ + } \ + a = v_load(ptra); \ + b = v_load(ptrb); \ + c = v_load(ptrc); \ + d = v_load(ptrd); \ +} \ +inline void v_store_interleave( _Tp* ptr, const 
v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    int i, i2; \
+    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    v_store(ptrb, b); \
+    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
+    { \
+        ptr[i2] = ptra[i]; \
+        ptr[i2+1] = ptrb[i]; \
+    } \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    int i, i3; \
+    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    v_store(ptrb, b); \
+    v_store(ptrc, c); \
+    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
+    { \
+        ptr[i3] = ptra[i]; \
+        ptr[i3+1] = ptrb[i]; \
+        ptr[i3+2] = ptrc[i]; \
+    } \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    int i, i4; \
+    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    v_store(ptrb, b); \
+    v_store(ptrc, c); \
+    v_store(ptrd, d); \
+    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
+    { \
+        ptr[i4] = ptra[i]; \
+        ptr[i4+1] = ptrb[i]; \
+        ptr[i4+2] = ptrc[i]; \
+        ptr[i4+3] = ptrd[i]; \
+    } \
+} \
+inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
+{ \
+    _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptrvec, vec); \
+    for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
+    { \
+        ptr[4*i  ] = ptrvec[4*i  ]; \
+        ptr[4*i+1] = ptrvec[4*i+2]; \
+        ptr[4*i+2] = ptrvec[4*i+1]; \
+        ptr[4*i+3] = ptrvec[4*i+3]; \
+    } \
+    return v_load(ptr); \
+} \
+inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
+{ \
+    _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptrvec, vec); \
+    for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
+    { \
+        ptr[8*i  ] = ptrvec[8*i  ]; \
+        ptr[8*i+1] = ptrvec[8*i+4]; \
+        ptr[8*i+2] = ptrvec[8*i+1]; \
+        ptr[8*i+3] = ptrvec[8*i+5]; \
+        ptr[8*i+4] = ptrvec[8*i+2]; \
+        ptr[8*i+5] = ptrvec[8*i+6]; \
+        ptr[8*i+6] = ptrvec[8*i+3]; \
+        ptr[8*i+7] = ptrvec[8*i+7]; \
+    } \
+    return v_load(ptr); \
+}
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar, u8, 8)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar, i8, 8)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort, u16, 16)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short, i16, 16)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned, u32, 32)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int, i32, 32)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float, f32, 32)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64, u64, 64)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64, i64, 64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double, f64, 64)
+#endif
+
+//////////// PopCount ////////////
 
 static const unsigned char popCountTable[] =
 {
@@ -325,1354 +2332,571 @@ static const unsigned char popCountTable[] =
     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
 };
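+
+// The 256-entry table above gives the bit count of every byte value, so a
+// vector popcount is just one table lookup per byte. The same technique in
+// scalar form (an illustrative sketch):
+//
+//   unsigned popcount_u32(unsigned x)
+//   {
+//       return popCountTable[x & 0xff] + popCountTable[(x >> 8) & 0xff] +
+//              popCountTable[(x >> 16) & 0xff] + popCountTable[x >> 24];
+//   }
+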
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
-    for (int i = 0; i < n*(int)sizeof(_Tp); i++)
-        b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
-    return b;
-}
-
-template<typename _Tp, int n>
-inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                      v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
-{
-    for( int i = 0; i < n; i++ )
-    {
-        minval.s[i] = std::min(a.s[i], b.s[i]);
-        maxval.s[i] = std::max(a.s[i], b.s[i]);
-    }
-}
-
-#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
-template<typename _Tp, int n> \
-inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    typedef typename V_TypeTraits<_Tp>::int_type itype; \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
-    return c; \
-}
+#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
+inline _rTpvec v_popcount(const _Tpvec& a) \
+{ \
+    uchar CV_DECL_ALIGNED(32) ptra[16] = {0}; \
+    v_store(ptra, v_reinterpret_as_u8(a)); \
+    _rTp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
+    v_store(ptr, v_setzero_##suffix()); \
+    for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
+        ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
+    return v_load(ptr); \
+}
-OPENCV_HAL_IMPL_CMP_OP(<)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
-OPENCV_HAL_IMPL_CMP_OP(>)
+//////////// SignMask ////////////
-OPENCV_HAL_IMPL_CMP_OP(<=)
-
-OPENCV_HAL_IMPL_CMP_OP(>=)
-
-OPENCV_HAL_IMPL_CMP_OP(==)
-
-OPENCV_HAL_IMPL_CMP_OP(!=)
-
-template<int n>
-inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
-{
-    typedef typename V_TypeTraits<float>::int_type itype;
-    v_reg<float, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
-    return c;
-}
-template<int n>
-inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
-{
-    typedef typename V_TypeTraits<double>::int_type itype;
-    v_reg<double, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
-    return c;
-}
-
-#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
-template<typename _Tp, int n> \
-inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    typedef _Tp2 rtype; \
-    v_reg<rtype, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
-    return c; \
-}
+#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, width, shift) \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    int mask = 0; \
+    vsetvlmax_e##width##m1(); \
+    _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift)); \
+    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
+        mask |= (int)(tmp.val[i]) << i; \
+    return mask; \
+}
-OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 8, 7)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 16, 15)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 32, 31)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 64, 63)
-OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
+inline int v_signmask(const v_int8x16& a)
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+inline int v_signmask(const v_int16x8& a)
+{ return v_signmask(v_reinterpret_as_u16(a)); }
+inline int v_signmask(const v_int32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_int64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#endif
-OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
-template<typename T> inline T _absdiff(T a, T b)
-{
-    return a > b ? a - b : b - a;
-}
+//////////// Scan forward ////////////
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
-{
-    typedef typename V_TypeTraits<_Tp>::abs_type rtype;
-    v_reg<rtype, n> c;
-    const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
-    for( int i = 0; i < n; i++ )
-    {
-        rtype ua = a.s[i] ^ mask;
-        rtype ub = b.s[i] ^ mask;
-        c.s[i] = _absdiff(ua, ub);
-    }
-    return c;
-}
-
-inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
-{
-    v_float32x4 c;
-    for( int i = 0; i < c.nlanes; i++ )
-        c.s[i] = _absdiff(a.s[i], b.s[i]);
-    return c;
-}
-
-inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
-{
-    v_float64x2 c;
-    for( int i = 0; i < c.nlanes; i++ )
-        c.s[i] = _absdiff(a.s[i], b.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++)
-        c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = 1.f/std::sqrt(a.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                           const v_reg<_Tp, n>& c)
-{
-    v_reg<_Tp, n> d;
-    for( int i = 0; i < n; i++ )
-        d.s[i] = a.s[i]*b.s[i] + c.s[i];
-    return d;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                              const v_reg<_Tp, n>& c)
-{
-    return v_fma(a, b, c);
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, n/2> c;
-    for( int i = 0; i < (n/2); i++ )
-        c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-          const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, n/2> s;
-    for( int i = 0; i < (n/2); i++ )
-        s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
-    return s;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{ return v_dotprod(a, b); }
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-               const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
-{ return v_dotprod(a, b, c); }
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
-v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef typename V_TypeTraits<_Tp>::q_type q_type;
-    v_reg<q_type, n/4> s;
-    for( int i = 0; i < (n/4); i++ )
-        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
-                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
-    return s;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
-v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                 const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
-{
-    typedef typename V_TypeTraits<_Tp>::q_type q_type;
-    v_reg<q_type, n/4> s;
-    for( int i = 0; i < (n/4); i++ )
-        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
-                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
-    return s;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
-v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{ return v_dotprod_expand(a, b); }
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
-v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                      const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
-{ return v_dotprod_expand(a, b, c); }
-
-template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
-                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = (w_type)a.s[i]*b.s[i];
-        d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
-    }
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
-                                                 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
-    }
-}
-
-#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
-{ \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = (_Tp)(a.s[i] shift_op imm); \
-    return c; \
-}
+#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
+inline int v_scan_forward(const _Tpvec& a) \
+{ \
+    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
+    v_store(ptr, v_reinterpret_as_##suffix(a)); \
+    for (int i = 0; i < _Tpvec::nlanes; i++) \
+        if(int(ptr[i]) < 0) \
+            return i; \
+    return 0; \
+}
-OPENCV_HAL_IMPL_SHIFT_OP(<< )
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
+#endif
-OPENCV_HAL_IMPL_SHIFT_OP(>> )
+//////////// Pack triplets ////////////
-#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
-template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
+#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, _Tp) \
+inline _Tpvec v_pack_triplets(const _Tpvec& vec) \
 { \
-    v_reg<_Tp, n> b; \
-    for (int i = 0; i < n; i++) \
+    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
+    _Tp CV_DECL_ALIGNED(32) ptrvec[_Tpvec::nlanes] = {0}; \
+    v_store(ptrvec, vec); \
+    for (int i = 0; i < _Tpvec::nlanes/4; i++) \
     { \
-        int sIndex = i opA imm; \
-        if (0 <= sIndex && sIndex < n) \
-        { \
-            b.s[i] = a.s[sIndex]; \
-        } \
-        else \
-        { \
-            b.s[i] = 0; \
-        } \
+        ptr[3*i  ] = ptrvec[4*i  ]; \
+        ptr[3*i+1] = ptrvec[4*i+1]; \
+        ptr[3*i+2] = ptrvec[4*i+2]; \
     } \
-    return b; \
-} \
-template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    for (int i = 0; i < n; i++) \
-    { \
-        int aIndex = i opA imm; \
-        int bIndex = i opA imm opB n; \
-        if (0 <= bIndex && bIndex < n) \
-        { \
-            c.s[i] = b.s[bIndex]; \
-        } \
-        else if (0 <= aIndex && aIndex < n) \
-        { \
-            c.s[i] = a.s[aIndex]; \
-        } \
-        else \
-        { \
-            c.s[i] = 0; \
-        } \
-    } \
-    return c; \
-}
+    return v_load(ptr); \
+}
-OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8x16, schar)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16x8, short)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32x4, int)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float)
-OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
-template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
-{
-    typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
-    for( int i = 1; i < n; i++ )
-        c += a.s[i];
-    return c;
-}
-
-inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
-                                 const v_float32x4& c, const v_float32x4& d)
-{
-    v_float32x4 r;
-    r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
-    r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
-    r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
-    r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
-    return r;
-}
-
-template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
-    for (int i = 1; i < n; i++)
-        c += _absdiff(a.s[i], b.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
-{
-    int mask = 0;
-    for( int i = 0; i < n; i++ )
-        mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
-    return mask;
-}
-
-template<typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
-{
-    for (int i = 0; i < n; i++)
-        if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0)
-            return i;
-    return 0;
-}
-
-template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
-            return false;
-    return true;
-}
+////// FP16 support ///////
+
+#if CV_FP16
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr)));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v));
+}
+#else
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    const int N = 4;
+    float buf[N];
+    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
+    return v_load(buf);
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    const int N = 4;
+    float buf[N];
+    v_store(buf, v);
+    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
+}
+#endif
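+
+// Usage sketch for the FP16 bridge above (buffer names are illustrative);
+// both branches expose the same interface, so callers never depend on CV_FP16:
+//
+//   float16_t src[4], dst[4];
+//   v_float32x4 v = v_load_expand(src);   // 4 x fp16 -> 4 x fp32
+//   v = v * v;                            // any fp32 arithmetic
+//   v_pack_store(dst, v);                 // 4 x fp32 -> 4 x fp16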
-
-template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
-            return true;
-    return false;
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
-                                                            const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef V_TypeTraits<_Tp> Traits;
-    typedef typename Traits::int_type int_type;
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-    {
-        int_type m = Traits::reinterpret_int(mask.s[i]);
-        CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc
-        c.s[i] = m ? a.s[i] : b.s[i];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
-                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
-                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
-{
-    for( int i = 0; i < (n/2); i++ )
-    {
-        b0.s[i] = a.s[i];
-        b1.s[i] = a.s[i+(n/2)];
-    }
-}
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_expand_low(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
-    for( int i = 0; i < (n/2); i++ )
-        b.s[i] = a.s[i];
-    return b;
-}
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_expand_high(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
-    for( int i = 0; i < (n/2); i++ )
-        b.s[i] = a.s[i+(n/2)];
-    return b;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
-    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
-    v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
-                                                 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
-{
-    int i;
-    for( i = 0; i < n/2; i++ )
-    {
-        b0.s[i*2] = a0.s[i];
-        b0.s[i*2+1] = a1.s[i];
-    }
-    for( ; i < n; i++ )
-    {
-        b1.s[i*2-n] = a0.s[i];
-        b1.s[i*2-n+1] = a1.s[i];
-    }
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
-{
-#if CV_STRONG_ALIGNMENT
-    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
-#endif
-    return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
-{
-    CV_Assert(isAligned<sizeof(v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>)>(ptr));
-    return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
-{
-#if CV_STRONG_ALIGNMENT
-    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
-#endif
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for( int i = 0; i < c.nlanes/2; i++ )
-    {
-        c.s[i] = ptr[i];
-    }
-    return c;
-}
+
+////////////// Rounding //////////////
+
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    vsetvlmax_e32m1();
+    return v_int32x4(vfcvt_x_f_v_i32m1(a));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    v_float32x4 ZP5 = v_setall_f32(0.5f);
+    v_float32x4 t = a - ZP5;
+    vsetvlmax_e32m1();
+    return v_int32x4(vfcvt_x_f_v_i32m1(t));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    v_float32x4 ZP5 = v_setall_f32(0.5f);
+    v_float32x4 t = a + ZP5;
+    vsetvlmax_e32m1();
+    return v_int32x4(vfcvt_x_f_v_i32m1(t));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{
+    vsetvlmax_e32m1();
+    return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a));
+}
+#if CV_SIMD128_64F
+inline v_int32x4 v_round(const v_float64x2& a)
+{
+    double arr[4] = {a.val[0], a.val[1], 0, 0};
+    vsetvlmax_e64m2();
+    vfloat64m2_t tmp = vle64_v_f64m2(arr);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
+}
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
+    vsetvlmax_e64m2();
+    vfloat64m2_t tmp = vle64_v_f64m2(arr);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+    double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
+    vsetvlmax_e64m2();
+    vfloat64m2_t tmp = vle64_v_f64m2(arr);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+    double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
+    vsetvlmax_e64m2();
+    vfloat64m2_t tmp = vle64_v_f64m2(arr);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{
+    double arr[4] = {a.val[0], a.val[1], 0, 0};
+    vsetvlmax_e64m2();
+    vfloat64m2_t tmp = vle64_v_f64m2(arr);
+    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp));
+}
+#endif
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
-{
-#if CV_STRONG_ALIGNMENT
-    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
-    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
-#endif
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for( int i = 0; i < c.nlanes/2; i++ )
-    {
-        c.s[i] = loptr[i];
-        c.s[i+c.nlanes/2] = hiptr[i];
-    }
-    return c;
-}
+
+
+//////// Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    v_int32x4 t1, t2;
+    vsetvlmax_e32m2();
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
+    v_load_deinterleave(ptr, t1, t2);
+    return t1 + t2;
+}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    v_int32x4 t1, t2;
+    vsetvlmax_e32m2();
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
+    v_load_deinterleave(ptr, t1, t2);
+    return t1 + t2 + c;
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    v_int64x2 t1, t2;
+    vsetvlmax_e64m2();
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
+    v_load_deinterleave(ptr, t1, t2);
+    return t1 + t2;
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    v_int64x2 t1, t2;
+    vsetvlmax_e64m2();
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
+    v_load_deinterleave(ptr, t1, t2);
+    return t1 + t2 + c;
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    v_uint32x4 t1, t2, t3, t4;
+    vsetvlmax_e32m4();
+    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
+                                   const v_uint32x4& c)
+{
+    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    v_uint32x4 t1, t2, t3, t4;
+    vsetvlmax_e32m4();
+    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    int CV_DECL_ALIGNED(32) ptr[16] = {0};
+    v_int32x4 t1, t2, t3, t4;
+    vsetvlmax_e32m4();
+    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                  const v_int32x4& c)
+{
+    int CV_DECL_ALIGNED(32) ptr[16] = {0};
+    v_int32x4 t1, t2, t3, t4;
+    vsetvlmax_e32m4();
+    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    v_uint64x2 t1, t2, t3, t4;
+    vsetvlmax_e64m4();
+    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    v_uint64x2 t1, t2, t3, t4;
+    vsetvlmax_e64m4();
+    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    v_int64x2 t1, t2, t3, t4;
+    vsetvlmax_e64m4();
+    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
+                                  const v_int64x2& c)
+{
+    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    v_int64x2 t1, t2, t3, t4;
+    vsetvlmax_e64m4();
+    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 32 >> 64f
+#if CV_SIMD128_64F
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
+                                    const v_float64x2& c)
+{ return v_dotprod_expand(a, b) + c; }
+#endif
-
-template<typename _Tp>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
-v_load_expand(const _Tp* ptr)
-{
-#if CV_STRONG_ALIGNMENT
-    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
-#endif
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
-    for( int i = 0; i < c.nlanes; i++ )
-    {
-        c.s[i] = ptr[i];
-    }
-    return c;
-}
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    vsetvlmax_e32m2();
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
+    v_int32x4 t1 = v_load(ptr);
+    v_int32x4 t2 = v_load(ptr+4);
+    return t1 + t2;
+}
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    vsetvlmax_e32m2();
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
+    v_int32x4 t1 = v_load(ptr);
+    v_int32x4 t2 = v_load(ptr+4);
+    return t1 + t2 + c;
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    vsetvlmax_e64m2();
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
+    v_int64x2 t1 = v_load(ptr);
+    v_int64x2 t2 = v_load(ptr+2);
+    return t1 + t2;
+}
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    vsetvlmax_e64m2();
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
+    v_int64x2 t1 = v_load(ptr);
+    v_int64x2 t2 = v_load(ptr+2);
+    return t1 + t2 + c;
+}
+
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{
+    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    vsetvlmax_e32m4();
+    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
+    v_uint32x4 t1 = v_load(ptr);
+    v_uint32x4 t2 = v_load(ptr+4);
+    v_uint32x4 t3 = v_load(ptr+8);
+    v_uint32x4 t4 = v_load(ptr+12);
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    
vsetvlmax_e32m4(); + vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b)); + v_uint32x4 t1 = v_load(ptr); + v_uint32x4 t2 = v_load(ptr+4); + v_uint32x4 t3 = v_load(ptr+8); + v_uint32x4 t4 = v_load(ptr+12); + return t1 + t2 + t3 + t4 + c; +} +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ + int CV_DECL_ALIGNED(32) ptr[16] = {0}; + vsetvlmax_e32m4(); + vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b)); + v_int32x4 t1 = v_load(ptr); + v_int32x4 t2 = v_load(ptr+4); + v_int32x4 t3 = v_load(ptr+8); + v_int32x4 t4 = v_load(ptr+12); + return t1 + t2 + t3 + t4; +} +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ + int CV_DECL_ALIGNED(32) ptr[16] = {0}; + vsetvlmax_e32m4(); + vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b)); + v_int32x4 t1 = v_load(ptr); + v_int32x4 t2 = v_load(ptr+4); + v_int32x4 t3 = v_load(ptr+8); + v_int32x4 t4 = v_load(ptr+12); + return t1 + t2 + t3 + t4 + c; +} + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ + uint64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e64m4(); + vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b)); + v_uint64x2 t1 = v_load(ptr); + v_uint64x2 t2 = v_load(ptr+2); + v_uint64x2 t3 = v_load(ptr+4); + v_uint64x2 t4 = v_load(ptr+6); + return t1 + t2 + t3 + t4; +} +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ + uint64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e64m4(); + vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b)); + v_uint64x2 t1 = v_load(ptr); + v_uint64x2 t2 = v_load(ptr+2); + v_uint64x2 t3 = v_load(ptr+4); + v_uint64x2 t4 = v_load(ptr+6); + return t1 + t2 + t3 + t4 + c; +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ + int64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e64m4(); + vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b)); + v_int64x2 t1 = v_load(ptr); + v_int64x2 t2 = v_load(ptr+2); + v_int64x2 t3 = v_load(ptr+4); + v_int64x2 t4 = v_load(ptr+6); + return t1 + t2 + t3 + t4; +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ + int64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e64m4(); + vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b)); + v_int64x2 t1 = v_load(ptr); + v_int64x2 t2 = v_load(ptr+2); + v_int64x2 t3 = v_load(ptr+4); + v_int64x2 t4 = v_load(ptr+6); + return t1 + t2 + t3 + t4 + c; +} + +// 32 >> 64f +#if CV_SIMD128_64F +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod_fast(a, b)); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } #endif - typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg::nlanes128> c; - for( int i = 0; i < c.nlanes; i++ ) - { - c.s[i] = ptr[i]; - } - return c; -} -template -inline v_reg::q_type, V_TypeTraits<_Tp>::nlanes128 / 4> -v_load_expand_q(const _Tp* ptr) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg::nlanes128> c; - for( int i = 0; i < c.nlanes; i++ ) - { - c.s[i] = ptr[i]; - } - return c; -} - -template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i2; - for( i = i2 = 0; i < n; i++, i2 += 2 ) - { - 
a.s[i] = ptr[i2]; - b.s[i] = ptr[i2+1]; - } -} - -template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i3; - for( i = i3 = 0; i < n; i++, i3 += 3 ) - { - a.s[i] = ptr[i3]; - b.s[i] = ptr[i3+1]; - c.s[i] = ptr[i3+2]; - } -} - -template -inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, - v_reg<_Tp, n>& d) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i4; - for( i = i4 = 0; i < n; i++, i4 += 4 ) - { - a.s[i] = ptr[i4]; - b.s[i] = ptr[i4+1]; - c.s[i] = ptr[i4+2]; - d.s[i] = ptr[i4+3]; - } -} - -template -inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i2; - for( i = i2 = 0; i < n; i++, i2 += 2 ) - { - ptr[i2] = a.s[i]; - ptr[i2+1] = b.s[i]; - } -} - -template -inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i3; - for( i = i3 = 0; i < n; i++, i3 += 3 ) - { - ptr[i3] = a.s[i]; - ptr[i3+1] = b.s[i]; - ptr[i3+2] = c.s[i]; - } -} - -template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, - const v_reg<_Tp, n>& d, - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i4; - for( i = i4 = 0; i < n; i++, i4 += 4 ) - { - ptr[i4] = a.s[i]; - ptr[i4+1] = b.s[i]; - ptr[i4+2] = c.s[i]; - ptr[i4+3] = d.s[i]; - } -} - -template -inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - for( int i = 0; i < n; i++ ) - ptr[i] = a.s[i]; -} - -template -inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - v_store(ptr, a); -} - -template -inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - for( int i = 0; i < (n/2); i++ ) - ptr[i] = a.s[i]; -} - -template -inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - for( int i = 0; i < (n/2); i++ ) - ptr[i] = a.s[i+(n/2)]; -} - -template -inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - CV_Assert(isAligned)>(ptr)); - v_store(ptr, a); -} - -template -inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - CV_Assert(isAligned)>(ptr)); - v_store(ptr, a); -} - -template -inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/) -{ - CV_Assert(isAligned)>(ptr)); - v_store(ptr, a); -} - -template -inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = a.s[i]; - c.s[i+(n/2)] = b.s[i]; - } - return c; -} - -template -inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = a.s[i+(n/2)]; - c.s[i+(n/2)] = b.s[i+(n/2)]; - } - return c; -} - -template -inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - 
v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) -{ - for( int i = 0; i < (n/2); i++ ) - { - low.s[i] = a.s[i]; - low.s[i+(n/2)] = b.s[i]; - high.s[i] = a.s[i+(n/2)]; - high.s[i+(n/2)] = b.s[i+(n/2)]; - } -} - -template -inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = a.s[n-i-1]; - return c; -} - -template -inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> r; - const int shift = n - s; - int i = 0; - for (; i < shift; ++i) - r.s[i] = a.s[i+s]; - for (; i < n; ++i) - r.s[i] = b.s[i-shift]; - return r; -} - -template -inline _Tp v_extract_n(const v_reg<_Tp, n>& v) -{ - CV_DbgAssert(s >= 0 && s < n); - return v.s[s]; -} - -template -inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a) -{ - CV_DbgAssert(i >= 0 && i < n); - return v_reg<_Tp, n>::all(a.s[i]); -} - -template inline v_reg v_round(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvRound(a.s[i]); - return c; -} - -template inline v_reg v_round(const v_reg& a, const v_reg& b) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvRound(a.s[i]); - c.s[i+n] = cvRound(b.s[i]); - } - return c; -} - -template inline v_reg v_floor(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvFloor(a.s[i]); - return c; -} - -template inline v_reg v_ceil(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvCeil(a.s[i]); - return c; -} - -template inline v_reg v_trunc(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (int)(a.s[i]); - return c; -} - -template inline v_reg v_round(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvRound(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_floor(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvFloor(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_ceil(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvCeil(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_trunc(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvCeil(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (float)a.s[i]; - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = (float)a.s[i]; - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a, const v_reg& b) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = (float)a.s[i]; - c.s[i+n] = (float)b.s[i]; - } - return c; -} - -CV_INLINE v_reg v_cvt_f64(const v_reg& a) -{ - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) -{ - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i + 2]; - return c; -} - -CV_INLINE v_reg v_cvt_f64(const v_reg& a) -{ - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) -{ - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i + 2]; - return c; -} - -CV_INLINE v_reg v_cvt_f64(const v_reg& a) -{ - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -CV_INLINE v_reg 
v_cvt_f64_high(const v_reg& a) -{ - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - - -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx) -{ - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) - c.s[i] = tab[idx[i]]; - return c; -} -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx) -{ - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) - c.s[i] = tab[idx[i / 2] + i % 2]; - return c; -} -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx) -{ - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) - c.s[i] = tab[idx[i / 4] + i % 4]; - return c; -} - -template inline v_reg v_lut(const int* tab, const v_reg& idx) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = tab[idx.s[i]]; - return c; -} - -template inline v_reg v_lut(const unsigned* tab, const v_reg& idx) -{ - v_reg c; - for (int i = 0; i < n; i++) - c.s[i] = tab[idx.s[i]]; - return c; -} - -template inline v_reg v_lut(const float* tab, const v_reg& idx) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = tab[idx.s[i]]; - return c; -} - -template inline v_reg v_lut(const double* tab, const v_reg& idx) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = tab[idx.s[i]]; - return c; -} - - -inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) -{ - return v_lut(tab, idxvec.s); -} - -inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) -{ - return v_lut(tab, idxvec.s); -} - -inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) -{ - return v_lut(tab, idxvec.s); -} - -inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) -{ - return v_lut(tab, idxvec.s); -} - - -template inline void v_lut_deinterleave(const float* tab, const v_reg& idx, - v_reg& x, v_reg& y) -{ - for( int i = 0; i < n; i++ ) - { - int j = idx.s[i]; - x.s[i] = tab[j]; - y.s[i] = tab[j+1]; - } -} - -template inline void v_lut_deinterleave(const double* tab, const v_reg& idx, - v_reg& x, v_reg& y) -{ - for( int i = 0; i < n; i++ ) - { - int j = idx.s[i]; - x.s[i] = tab[j]; - y.s[i] = tab[j+1]; - } -} - -template inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec) -{ - v_reg<_Tp, n> c; - for (int i = 0; i < n/4; i++) - { - c.s[4*i ] = vec.s[4*i ]; - c.s[4*i+1] = vec.s[4*i+2]; - c.s[4*i+2] = vec.s[4*i+1]; - c.s[4*i+3] = vec.s[4*i+3]; - } - return c; -} - -template inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec) -{ - v_reg<_Tp, n> c; - for (int i = 0; i < n/8; i++) - { - c.s[8*i ] = vec.s[8*i ]; - c.s[8*i+1] = vec.s[8*i+4]; - c.s[8*i+2] = vec.s[8*i+1]; - c.s[8*i+3] = vec.s[8*i+5]; - c.s[8*i+4] = vec.s[8*i+2]; - c.s[8*i+5] = vec.s[8*i+6]; - c.s[8*i+6] = vec.s[8*i+3]; - c.s[8*i+7] = vec.s[8*i+7]; - } - return c; -} - -template inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec) -{ - v_reg<_Tp, n> c; - for (int i = 0; i < n/4; i++) - { - c.s[3*i ] = vec.s[4*i ]; - c.s[3*i+1] = vec.s[4*i+1]; - c.s[3*i+2] = vec.s[4*i+2]; - } - return c; -} - -template -inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, - const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, - v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, - v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 ) -{ - b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); - b1 = v_reg<_Tp, 
4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); - b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); - b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); -} - -#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } - -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64) - -#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } - -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64) - -#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \ -template inline _Tpvec \ - v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \ -{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); } - -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64) - -#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \ -template inline _Tpvec v_shl(const _Tpvec& a) \ -{ return a << n; } - -OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short) -OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int) -OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64) - -#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \ -template inline _Tpvec v_shr(const _Tpvec& a) \ -{ return a >> n; } - -OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short) -OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int) -OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64) - -#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \ -template inline _Tpvec v_rshr(const _Tpvec& a) \ -{ \ - _Tpvec c; \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ - return c; \ -} - -OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short) -OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int) -OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64) - -#define 
OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \ -inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ -{ \ - _Tpnvec c; \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - { \ - c.s[i] = cast<_Tpn>(a.s[i]); \ - c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \ - } \ - return c; \ -} - -OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast) - -#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ -template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ -{ \ - _Tpnvec c; \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - { \ - c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ - c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ - } \ - return c; \ -} - -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) - -#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ -inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ -{ \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - ptr[i] = cast<_Tpn>(a.s[i]); \ -} - -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) - -#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ -template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ -{ \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ -} - -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) 
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) - -template -inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - for (int i = 0; i < n; ++i) - { - mptr[i] = (_Tpm)a.s[i]; - mptr[i + n] = (_Tpm)b.s[i]; - } -} - - - -inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) -{ - v_uint8x16 mask; - _pack_b(mask.s, a, b); - return mask; -} - - -inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, - const v_uint32x4& c, const v_uint32x4& d) -{ - v_uint8x16 mask; - _pack_b(mask.s, a, b); - _pack_b(mask.s + 8, c, d); - return mask; -} - -inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, - const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, - const v_uint64x2& g, const v_uint64x2& h) -{ - v_uint8x16 mask; - _pack_b(mask.s, a, b); - _pack_b(mask.s + 4, c, d); - _pack_b(mask.s + 8, e, f); - _pack_b(mask.s + 12, g, h); - return mask; -} inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) { - return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], - v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], - v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], - v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); + vsetvlmax_e32m1(); + vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v)); + res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1); + res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2); + res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3); + return v_float32x4(res); } inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& m3) + const v_float32x4& a) { - return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0], - v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1], - v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2], - v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); + vsetvlmax_e32m1(); + vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v)); + res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1); + res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2); + return v_float32x4(res) + a; +} + +#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width) \ +inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \ +{ \ + _Tpw CV_DECL_ALIGNED(32) ptr[_Tpwvec::nlanes*2] = {0}; \ + vsetvlmax_e##width##m2(); \ + vse##width##_v_##suffix##m2(ptr, wmul(a, b)); \ + vsetvlmax_e##width##m1(); \ + c = _Tpwvec(vle##width##_v_##suffix##m1(ptr)); \ + d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes)); \ +} + +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 
32) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64) + + +inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) +{ + vsetvlmax_e16m1(); + return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b), 16)); +} +inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) +{ + vsetvlmax_e16m1(); + return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b), 16)); } -inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) -{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } -inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } +//////// Saturating Multiply //////// -inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) -{ return v_dotprod_expand(a, b); } -inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b, c); } - -////// FP16 support /////// - -inline v_reg::nlanes128> -v_load_expand(const float16_t* ptr) -{ - v_reg::nlanes128> v; - for( int i = 0; i < v.nlanes; i++ ) - { - v.s[i] = ptr[i]; - } - return v; +#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \ +inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +{ \ + _wTpvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ +} \ +inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a = a * b; \ + return a; \ } -inline void -v_pack_store(float16_t* ptr, const v_reg::nlanes128>& v) -{ - for( int i = 0; i < v.nlanes; i++ ) - { - ptr[i] = float16_t(v.s[i]); - } -} +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4) +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4) + inline void v_cleanup() {} - -#ifndef CV_DOXYGEN CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END -#endif + + } #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp index ef928f6a5c..b4178af8b7 100644 --- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp @@ -257,221 +257,20 @@ struct v_float64x2 v128_t val; }; -namespace fallback +namespace { - -template struct v_reg -{ - typedef _Tp lane_type; - enum { nlanes = n }; - - explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; } - - v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; } - - v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; } - - v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, - _Tp s4, _Tp s5, _Tp s6, _Tp s7) - { - s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; - s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; - } - - v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, - _Tp s4, _Tp s5, _Tp s6, _Tp s7, - _Tp s8, _Tp s9, _Tp s10, _Tp s11, - _Tp s12, _Tp s13, _Tp s14, _Tp s15) - { - s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; - s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; - s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11; - s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15; - } - - v_reg() {} - - v_reg(const v_reg<_Tp, n> & r) - { - for( int i = 0; i < n; i++ ) - s[i] = r.s[i]; - } - - _Tp get0() const { return s[0]; } - - _Tp get(const int i) const { return s[i]; } - v_reg<_Tp, n> high() const - { - v_reg<_Tp, n> c; - int i; - for( i = 0; i < n/2; i++ ) - { - c.s[i] = s[i+(n/2)]; - 
c.s[i+(n/2)] = 0; - } - return c; - } - - static v_reg<_Tp, n> zero() - { - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = (_Tp)0; - return c; - } - - static v_reg<_Tp, n> all(_Tp s) - { - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = s; - return c; - } - - template v_reg<_Tp2, n2> reinterpret_as() const - { - size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); - v_reg<_Tp2, n2> c; - std::memcpy(&c.s[0], &s[0], bytes); - return c; - } - - v_reg(const cv::v_uint8x16& v) { wasm_v128_store(&s, v.val); } - v_reg(const cv::v_int8x16& v) { wasm_v128_store(&s, v.val); } - v_reg(const cv::v_uint16x8& v) { wasm_v128_store(&s, v.val); } - v_reg(const cv::v_int16x8& v) { wasm_v128_store(&s, v.val); } - v_reg(const cv::v_uint32x4& v) { wasm_v128_store(&s, v.val); } - v_reg(const cv::v_int32x4& v) { wasm_v128_store(&s, v.val); } - v_reg(const cv::v_float32x4& v) { wasm_v128_store(&s, v.val); } - v_reg(const cv::v_float64x2& v) { wasm_v128_store(&s, v.val); } - v_reg(const cv::v_uint64x2& v) { wasm_v128_store(&s, v.val); } - v_reg(const cv::v_int64x2& v) { wasm_v128_store(&s, v.val); } - - operator cv::v_uint8x16() const { return cv::v_uint8x16(wasm_v128_load(&s)); } - operator cv::v_int8x16() const { return cv::v_int8x16(wasm_v128_load(&s)); } - operator cv::v_uint16x8() const { return cv::v_uint16x8(wasm_v128_load(&s)); } - operator cv::v_int16x8() const { return cv::v_int16x8(wasm_v128_load(&s)); } - operator cv::v_uint32x4() const { return cv::v_uint32x4(wasm_v128_load(&s)); } - operator cv::v_int32x4() const { return cv::v_int32x4(wasm_v128_load(&s)); } - operator cv::v_float32x4() const { return cv::v_float32x4(wasm_v128_load(&s)); } - operator cv::v_float64x2() const { return cv::v_float64x2(wasm_v128_load(&s)); } - operator cv::v_uint64x2() const { return cv::v_uint64x2(wasm_v128_load(&s)); } - operator cv::v_int64x2() const { return cv::v_int64x2(wasm_v128_load(&s)); } - - _Tp s[n]; -}; - -typedef v_reg v_uint8x16; -typedef v_reg v_int8x16; -typedef v_reg v_uint16x8; -typedef v_reg v_int16x8; -typedef v_reg v_uint32x4; -typedef v_reg v_int32x4; -typedef v_reg v_float32x4; -typedef v_reg v_float64x2; -typedef v_reg v_uint64x2; -typedef v_reg v_int64x2; - -#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \ -template inline v_reg<_Tp, n> \ - operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return c; \ -} \ -template inline v_reg<_Tp, n>& \ - operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return a; \ -} - -OPENCV_HAL_IMPL_BIN_OP(+) -OPENCV_HAL_IMPL_BIN_OP(-) -OPENCV_HAL_IMPL_BIN_OP(*) -OPENCV_HAL_IMPL_BIN_OP(/) - -#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \ -template inline v_reg<_Tp, n> operator bit_op \ - (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - v_reg<_Tp, n> c; \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return c; \ -} \ -template inline v_reg<_Tp, n>& operator \ - bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) 
bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return a; \ -} - -OPENCV_HAL_IMPL_BIT_OP(&) -OPENCV_HAL_IMPL_BIT_OP(|) -OPENCV_HAL_IMPL_BIT_OP(^) - -template inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); - } - return c; -} - -#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ -template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ -{ \ - v_reg<_Tp2, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cfunc(a.s[i]); \ - return c; \ -} - -OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, - typename V_TypeTraits<_Tp>::abs_type) -OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int) -OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int) -OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int) -OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int) - -#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \ -template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cfunc(a.s[i], b.s[i]); \ - return c; \ -} - -#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \ -template inline _Tp func(const v_reg<_Tp, n>& a) \ -{ \ - _Tp c = a.s[0]; \ - for( int i = 1; i < n; i++ ) \ - c = cfunc(c, a.s[i]); \ - return c; \ -} - -OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min) -OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max) -OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min) -OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) +#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \ +inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; } +OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(short, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(int, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(float, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64) static const unsigned char popCountTable[] = { @@ -492,1184 +291,7 @@ static const unsigned char popCountTable[] = 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; - -template -inline v_reg::abs_type, n> v_popcount(const v_reg<_Tp, n>& a) -{ - v_reg::abs_type, n> b = v_reg::abs_type, n>::zero(); - for (int i = 0; i < (int)(n*sizeof(_Tp)); i++) - b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]]; - return b; -} - -template -inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval ) -{ - for( int i = 0; i < n; i++ ) - { - minval.s[i] = std::min(a.s[i], b.s[i]); - maxval.s[i] = std::max(a.s[i], b.s[i]); - } -} - -#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ -template \ -inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op 
b.s[i])); \ - return c; \ -} - -OPENCV_HAL_IMPL_CMP_OP(<) -OPENCV_HAL_IMPL_CMP_OP(>) -OPENCV_HAL_IMPL_CMP_OP(<=) -OPENCV_HAL_IMPL_CMP_OP(>=) -OPENCV_HAL_IMPL_CMP_OP(==) -OPENCV_HAL_IMPL_CMP_OP(!=) - -template -inline v_reg v_not_nan(const v_reg& a) -{ - typedef typename V_TypeTraits::int_type itype; - v_reg c; - for (int i = 0; i < n; i++) - c.s[i] = V_TypeTraits::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i])); - return c; -} -template -inline v_reg v_not_nan(const v_reg& a) -{ - typedef typename V_TypeTraits::int_type itype; - v_reg c; - for (int i = 0; i < n; i++) - c.s[i] = V_TypeTraits::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i])); - return c; -} - -#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \ -template \ -inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef _Tp2 rtype; \ - v_reg c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \ - return c; \ -} - -OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp) -OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp) -OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp) - -template inline T _absdiff(T a, T b) -{ - return a > b ? a - b : b - a; -} - -template -inline v_reg::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b) -{ - typedef typename V_TypeTraits<_Tp>::abs_type rtype; - v_reg c; - const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0); - for( int i = 0; i < n; i++ ) - { - rtype ua = a.s[i] ^ mask; - rtype ub = b.s[i] ^ mask; - c.s[i] = _absdiff(ua, ub); - } - return c; -} - -inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) -{ - v_float32x4 c; - for( int i = 0; i < c.nlanes; i++ ) - c.s[i] = _absdiff(a.s[i], b.s[i]); - return c; -} - -inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) -{ - v_float64x2 c; - for( int i = 0; i < c.nlanes; i++ ) - c.s[i] = _absdiff(a.s[i], b.s[i]); - return c; -} - -template -inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++) - c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i])); - return c; -} - -template -inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = 1.f/std::sqrt(a.s[i]); - return c; -} - -template -inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]); - return c; -} - -template -inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i]; - return c; -} - -template -inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg<_Tp, n>& c) -{ - v_reg<_Tp, n> d; - for( int i = 0; i < n; i++ ) - d.s[i] = a.s[i]*b.s[i] + c.s[i]; - return d; -} - -template -inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg<_Tp, n>& c) -{ - return v_fma(a, b, c); -} - -template inline v_reg::w_type, n/2> - v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg c; - for( int i = 0; i < (n/2); i++ ) - c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1]; - return c; -} - -template inline v_reg::w_type, n/2> - v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 
const v_reg::w_type, n / 2>& c) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg s; - for( int i = 0; i < (n/2); i++ ) - s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i]; - return s; -} - -template inline v_reg::q_type, n/4> - v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg s; - for( int i = 0; i < (n/4); i++ ) - s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + - (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3]; - return s; -} - -template inline v_reg::q_type, n/4> - v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg::q_type, n / 4>& c) -{ - typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg s; - for( int i = 0; i < (n/4); i++ ) - s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + - (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i]; - return s; -} - -template inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - v_reg::w_type, n/2>& c, - v_reg::w_type, n/2>& d) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = (w_type)a.s[i]*b.s[i]; - d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)]; - } -} - -template inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg<_Tp, n> c; - for (int i = 0; i < n; i++) - c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8); - return c; -} - -template inline void v_hsum(const v_reg<_Tp, n>& a, - v_reg::w_type, n/2>& c) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1]; - } -} - -#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ -template inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ -{ \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = (_Tp)(a.s[i] shift_op imm); \ - return c; \ -} - -OPENCV_HAL_IMPL_SHIFT_OP(<< ) -OPENCV_HAL_IMPL_SHIFT_OP(>> ) - -#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \ -template inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \ -{ \ - v_reg<_Tp, n> b; \ - for (int i = 0; i < n; i++) \ - { \ - int sIndex = i opA imm; \ - if (0 <= sIndex && sIndex < n) \ - { \ - b.s[i] = a.s[sIndex]; \ - } \ - else \ - { \ - b.s[i] = 0; \ - } \ - } \ - return b; \ -} \ -template inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - v_reg<_Tp, n> c; \ - for (int i = 0; i < n; i++) \ - { \ - int aIndex = i opA imm; \ - int bIndex = i opA imm opB n; \ - if (0 <= bIndex && bIndex < n) \ - { \ - c.s[i] = b.s[bIndex]; \ - } \ - else if (0 <= aIndex && aIndex < n) \ - { \ - c.s[i] = a.s[aIndex]; \ - } \ - else \ - { \ - c.s[i] = 0; \ - } \ - } \ - return c; \ -} - -OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +) -OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -) - -template inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) -{ - typename V_TypeTraits<_Tp>::sum_type c = a.s[0]; - for( int i = 1; i < n; i++ ) - c += a.s[i]; - return c; -} - -inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, - const v_float32x4& c, const v_float32x4& d) -{ - v_float32x4 r; - r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3]; - r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3]; - r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3]; - r.s[3] = d.s[0] + 
d.s[1] + d.s[2] + d.s[3]; - return r; -} - -template inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]); - for (int i = 1; i < n; i++) - c += _absdiff(a.s[i], b.s[i]); - return c; -} - -template inline int v_signmask(const v_reg<_Tp, n>& a) -{ - int mask = 0; - for( int i = 0; i < n; i++ ) - mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i; - return mask; -} - -template inline bool v_check_all(const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 ) - return false; - return true; -} - -template inline bool v_check_any(const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 ) - return true; - return false; -} - -template inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, - const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - typedef V_TypeTraits<_Tp> Traits; - typedef typename Traits::int_type int_type; - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - { - int_type m = Traits::reinterpret_int(mask.s[i]); - CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc - c.s[i] = m ? a.s[i] : b.s[i]; - } - return c; -} - -template inline void v_expand(const v_reg<_Tp, n>& a, - v_reg::w_type, n/2>& b0, - v_reg::w_type, n/2>& b1) -{ - for( int i = 0; i < (n/2); i++ ) - { - b0.s[i] = a.s[i]; - b1.s[i] = a.s[i+(n/2)]; - } -} - -template -inline v_reg::w_type, n/2> -v_expand_low(const v_reg<_Tp, n>& a) -{ - v_reg::w_type, n/2> b; - for( int i = 0; i < (n/2); i++ ) - b.s[i] = a.s[i]; - return b; -} - -template -inline v_reg::w_type, n/2> -v_expand_high(const v_reg<_Tp, n>& a) -{ - v_reg::w_type, n/2> b; - for( int i = 0; i < (n/2); i++ ) - b.s[i] = a.s[i+(n/2)]; - return b; -} - -template inline v_reg::int_type, n> - v_reinterpret_as_int(const v_reg<_Tp, n>& a) -{ - v_reg::int_type, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]); - return c; -} - -template inline v_reg::uint_type, n> - v_reinterpret_as_uint(const v_reg<_Tp, n>& a) -{ - v_reg::uint_type, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]); - return c; -} - -template inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, - v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ) -{ - int i; - for( i = 0; i < n/2; i++ ) - { - b0.s[i*2] = a0.s[i]; - b0.s[i*2+1] = a1.s[i]; - } - for( ; i < n; i++ ) - { - b1.s[i*2-n] = a0.s[i]; - b1.s[i*2-n+1] = a1.s[i]; - } -} - -template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr) -{ - return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); -} - -template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) -{ - return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); -} - -template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) -{ - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for( int i = 0; i < c.nlanes/2; i++ ) - { - c.s[i] = ptr[i]; - } - return c; -} - -template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr) -{ - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for( int i = 0; i < c.nlanes/2; i++ ) - { - c.s[i] = loptr[i]; - c.s[i+c.nlanes/2] = hiptr[i]; - } - return c; -} - -template -inline v_reg::w_type, V_TypeTraits<_Tp>::nlanes128 / 2> 
-v_load_expand(const _Tp* ptr) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg::nlanes128> c; - for( int i = 0; i < c.nlanes; i++ ) - { - c.s[i] = ptr[i]; - } - return c; -} - -template -inline v_reg::q_type, V_TypeTraits<_Tp>::nlanes128 / 4> -v_load_expand_q(const _Tp* ptr) -{ - typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg::nlanes128> c; - for( int i = 0; i < c.nlanes; i++ ) - { - c.s[i] = ptr[i]; - } - return c; -} - -template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b) -{ - int i, i2; - for( i = i2 = 0; i < n; i++, i2 += 2 ) - { - a.s[i] = ptr[i2]; - b.s[i] = ptr[i2+1]; - } -} - -template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) -{ - int i, i3; - for( i = i3 = 0; i < n; i++, i3 += 3 ) - { - a.s[i] = ptr[i3]; - b.s[i] = ptr[i3+1]; - c.s[i] = ptr[i3+2]; - } -} - -template -inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, - v_reg<_Tp, n>& d) -{ - int i, i4; - for( i = i4 = 0; i < n; i++, i4 += 4 ) - { - a.s[i] = ptr[i4]; - b.s[i] = ptr[i4+1]; - c.s[i] = ptr[i4+2]; - d.s[i] = ptr[i4+3]; - } -} - -template -inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) -{ - int i, i2; - for( i = i2 = 0; i < n; i++, i2 += 2 ) - { - ptr[i2] = a.s[i]; - ptr[i2+1] = b.s[i]; - } -} - -template -inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) -{ - int i, i3; - for( i = i3 = 0; i < n; i++, i3 += 3 ) - { - ptr[i3] = a.s[i]; - ptr[i3+1] = b.s[i]; - ptr[i3+2] = c.s[i]; - } -} - -template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, - const v_reg<_Tp, n>& d, - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) -{ - int i, i4; - for( i = i4 = 0; i < n; i++, i4 += 4 ) - { - ptr[i4] = a.s[i]; - ptr[i4+1] = b.s[i]; - ptr[i4+2] = c.s[i]; - ptr[i4+3] = d.s[i]; - } -} - -template -inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) -{ - for( int i = 0; i < n; i++ ) - ptr[i] = a.s[i]; -} - -template -inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < (n/2); i++ ) - ptr[i] = a.s[i]; -} - -template -inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < (n/2); i++ ) - ptr[i] = a.s[i+(n/2)]; -} - -template -inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - ptr[i] = a.s[i]; -} - -template -inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - ptr[i] = a.s[i]; -} - -template -inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/) -{ - for( int i = 0; i < n; i++ ) - ptr[i] = a.s[i]; -} - -template -inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = a.s[i]; - c.s[i+(n/2)] = b.s[i]; - } - return c; -} - -template -inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = a.s[i+(n/2)]; - c.s[i+(n/2)] = b.s[i+(n/2)]; - } - return c; -} - -template -inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - v_reg<_Tp, n>& low, 
v_reg<_Tp, n>& high) -{ - for( int i = 0; i < (n/2); i++ ) - { - low.s[i] = a.s[i]; - low.s[i+(n/2)] = b.s[i]; - high.s[i] = a.s[i+(n/2)]; - high.s[i+(n/2)] = b.s[i+(n/2)]; - } -} - -template -inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> r; - const int shift = n - s; - int i = 0; - for (; i < shift; ++i) - r.s[i] = a.s[i+s]; - for (; i < n; ++i) - r.s[i] = b.s[i-shift]; - return r; -} - -template inline v_reg v_round(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvRound(a.s[i]); - return c; -} - -template inline v_reg v_round(const v_reg& a, const v_reg& b) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvRound(a.s[i]); - c.s[i+n] = cvRound(b.s[i]); - } - return c; -} - -template inline v_reg v_floor(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvFloor(a.s[i]); - return c; -} - -template inline v_reg v_ceil(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvCeil(a.s[i]); - return c; -} - -template inline v_reg v_trunc(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (int)(a.s[i]); - return c; -} - -template inline v_reg v_round(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvRound(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_floor(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvFloor(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_ceil(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvCeil(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_trunc(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = (int)(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (float)a.s[i]; - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = (float)a.s[i]; - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a, const v_reg& b) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = (float)a.s[i]; - c.s[i+n] = (float)b.s[i]; - } - return c; -} - -inline v_float64x2 v_cvt_f64(const v_int32x4& a) -{ - v_float64x2 c; - for( int i = 0; i < 2; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) -{ - v_float64x2 c; - for( int i = 0; i < 2; i++ ) - c.s[i] = (double)a.s[i+2]; - return c; -} - -inline v_float64x2 v_cvt_f64(const v_float32x4& a) -{ - v_float64x2 c; - for( int i = 0; i < 2; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) -{ - v_float64x2 c; - for( int i = 0; i < 2; i++ ) - c.s[i] = (double)a.s[i+2]; - return c; -} - -inline v_float64x2 v_cvt_f64(const v_int64x2& a) -{ - v_float64x2 c; - for( int i = 0; i < 2; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx) -{ - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) - c.s[i] = tab[idx[i]]; - return c; -} -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx) -{ - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) - c.s[i] = tab[idx[i / 2] + i % 2]; - return c; -} 
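
For reference, the fallback v_lut family being removed above gathers table elements by index: v_lut reads tab[idx[i]] per lane, and v_lut_pairs reads consecutive pairs starting at each index. A minimal standalone sketch of those semantics (plain C++, not OpenCV code; the data values are illustrative):

// Standalone sketch of the v_lut / v_lut_pairs gather semantics.
#include <cstdio>

int main()
{
    const float tab[8] = {0.f, 10.f, 20.f, 30.f, 40.f, 50.f, 60.f, 70.f};
    const int idx[4] = {6, 0, 2, 4};
    float lut[4], pairs[4];
    for (int i = 0; i < 4; i++)
        lut[i] = tab[idx[i]];               // v_lut: one gathered element per lane
    for (int i = 0; i < 4; i++)
        pairs[i] = tab[idx[i / 2] + i % 2]; // v_lut_pairs: consecutive pairs per index
    std::printf("lut:   %g %g %g %g\n", lut[0], lut[1], lut[2], lut[3]);         // 60 0 20 40
    std::printf("pairs: %g %g %g %g\n", pairs[0], pairs[1], pairs[2], pairs[3]); // 60 70 0 10
    return 0;
}
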
-template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx) -{ - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) - c.s[i] = tab[idx[i / 4] + i % 4]; - return c; -} - -template inline v_reg v_lut(const int* tab, const v_reg& idx) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = tab[idx.s[i]]; - return c; -} - -template inline v_reg v_lut(const unsigned* tab, const v_reg& idx) -{ - v_reg c; - for (int i = 0; i < n; i++) - c.s[i] = tab[idx.s[i]]; - return c; -} - -template inline v_reg v_lut(const float* tab, const v_reg& idx) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = tab[idx.s[i]]; - return c; -} - -template inline v_reg v_lut(const double* tab, const v_reg& idx) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = tab[idx.s[i]]; - return c; -} - -template inline void v_lut_deinterleave(const float* tab, const v_reg& idx, - v_reg& x, v_reg& y) -{ - for( int i = 0; i < n; i++ ) - { - int j = idx.s[i]; - x.s[i] = tab[j]; - y.s[i] = tab[j+1]; - } -} - -template inline void v_lut_deinterleave(const double* tab, const v_reg& idx, - v_reg& x, v_reg& y) -{ - for( int i = 0; i < n; i++ ) - { - int j = idx.s[i]; - x.s[i] = tab[j]; - y.s[i] = tab[j+1]; - } -} - -template inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec) -{ - v_reg<_Tp, n> c; - for (int i = 0; i < n/4; i++) - { - c.s[4*i ] = vec.s[4*i ]; - c.s[4*i+1] = vec.s[4*i+2]; - c.s[4*i+2] = vec.s[4*i+1]; - c.s[4*i+3] = vec.s[4*i+3]; - } - return c; -} - -template inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec) -{ - v_reg<_Tp, n> c; - for (int i = 0; i < n/8; i++) - { - c.s[8*i ] = vec.s[8*i ]; - c.s[8*i+1] = vec.s[8*i+4]; - c.s[8*i+2] = vec.s[8*i+1]; - c.s[8*i+3] = vec.s[8*i+5]; - c.s[8*i+4] = vec.s[8*i+2]; - c.s[8*i+5] = vec.s[8*i+6]; - c.s[8*i+6] = vec.s[8*i+3]; - c.s[8*i+7] = vec.s[8*i+7]; - } - return c; -} - -template inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec) -{ - v_reg<_Tp, n> c; - for (int i = 0; i < n/4; i++) - { - c.s[3*i ] = vec.s[4*i ]; - c.s[3*i+1] = vec.s[4*i+1]; - c.s[3*i+2] = vec.s[4*i+2]; - } - return c; -} - -template -inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, - const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, - v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, - v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 ) -{ - b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); - b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); - b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); - b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); -} - -#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } - -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64) - -#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } - -OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16) 
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64) - -#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \ -template inline _Tpvec \ - v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \ -{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); } - -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64) - -#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \ -template inline _Tpvec v_shl(const _Tpvec& a) \ -{ return a << n; } - -OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short) -OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int) -OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64) - -#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \ -template inline _Tpvec v_shr(const _Tpvec& a) \ -{ return a >> n; } - -OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short) -OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int) -OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64) - -#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \ -template inline _Tpvec v_rshr(const _Tpvec& a) \ -{ \ - _Tpvec c; \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ - return c; \ -} - -OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short) -OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int) -OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64) - -#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \ -inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ -{ \ - _Tpnvec c; \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - { \ - c.s[i] = cast<_Tpn>(a.s[i]); \ - c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \ - } \ - return c; \ -} - -OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast) - -#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ -template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ -{ \ - _Tpnvec c; \ - for( 
int i = 0; i < _Tpvec::nlanes; i++ ) \ - { \ - c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ - c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ - } \ - return c; \ -} - -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) - -#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ -inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ -{ \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - ptr[i] = cast<_Tpn>(a.s[i]); \ -} - -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) - -#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ -template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ -{ \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ -} - -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) - -template -inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - for (int i = 0; i < n; ++i) - { - mptr[i] = (_Tpm)a.s[i]; - mptr[i + n] = (_Tpm)b.s[i]; - } -} - -inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) -{ - v_uint8x16 mask; - _pack_b(mask.s, a, b); - return mask; -} - -inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, - const v_uint32x4& c, const v_uint32x4& d) -{ - v_uint8x16 mask; - _pack_b(mask.s, a, b); - _pack_b(mask.s + 8, c, d); - return mask; -} - -inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, - const 
v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, - const v_uint64x2& g, const v_uint64x2& h) -{ - v_uint8x16 mask; - _pack_b(mask.s, a, b); - _pack_b(mask.s + 4, c, d); - _pack_b(mask.s + 8, e, f); - _pack_b(mask.s + 12, g, h); - return mask; -} - -inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, - const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& m3) -{ - return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], - v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], - v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], - v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); -} - -inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, - const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& m3) -{ - return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0], - v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1], - v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2], - v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); -} - -inline v_reg::nlanes128> -v_load_expand(const float16_t* ptr) -{ - v_reg::nlanes128> v; - for( int i = 0; i < v.nlanes; i++ ) - { - v.s[i] = ptr[i]; - } - return v; -} - -inline void -v_pack_store(float16_t* ptr, const v_reg::nlanes128>& v) -{ - for( int i = 0; i < v.nlanes; i++ ) - { - ptr[i] = float16_t(v.s[i]); - } -} - -inline void v_cleanup() {} -} // namespace fallback +} // namespace static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) { return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23); @@ -2644,8 +1266,31 @@ OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub) OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub) OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub) OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub) +#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012) +// details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 ) +// 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2 +inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b) +{ + uchar a_[16], b_[16]; + wasm_v128_store(a_, a.val); + wasm_v128_store(b_, b.val); + for (int i = 0; i < 16; i++) + a_[i] = (uchar)(a_[i] * b_[i]); + return v_uint8x16(wasm_v128_load(a_)); +} +inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) +{ + schar a_[16], b_[16]; + wasm_v128_store(a_, a.val); + wasm_v128_store(b_, b.val); + for (int i = 0; i < 16; i++) + a_[i] = (schar)(a_[i] * b_[i]); + return v_int8x16(wasm_v128_load(a_)); +} +#else OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul) OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul) +#endif OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul) OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul) @@ -2905,13 +1550,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { \ - fallback::_Tpvec a_(a); \ - fallback::v_store_low(ptr, a_); \ + _Tpvec::lane_type a_[_Tpvec::nlanes]; \ + wasm_v128_store(a_, a.val); \ + for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \ + ptr[i] = a_[i]; \ } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ { \ - 
fallback::_Tpvec a_(a); \ - fallback::v_store_high(ptr, a_); \ + _Tpvec::lane_type a_[_Tpvec::nlanes]; \ + wasm_v128_store(a_, a.val); \ + for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \ + ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \ } OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar) @@ -2977,8 +1626,12 @@ OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4) #define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \ inline scalartype v_reduce_sum(const _Tpvec& a) \ { \ - fallback::_Tpvec a_(a); \ - return fallback::v_reduce_sum(a_); \ + _Tpvec::lane_type a_[_Tpvec::nlanes]; \ + wasm_v128_store(a_, a.val); \ + scalartype c = a_[0]; \ + for (int i = 1; i < _Tpvec::nlanes; i++) \ + c += a_[i]; \ + return c; \ } OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned) @@ -3102,8 +1755,11 @@ inline v_uint32x4 v_popcount(const v_uint32x4& a) } inline v_uint64x2 v_popcount(const v_uint64x2& a) { - fallback::v_uint64x2 a_(a); - return fallback::v_popcount(a_); + uint64 a_[2], b_[2] = { 0 }; + wasm_v128_store(a_, a.val); + for (int i = 0; i < 16; i++) + b_[i / 8] += popCountTable[((uint8_t*)a_)[i]]; + return v_uint64x2(wasm_v128_load(b_)); } inline v_uint8x16 v_popcount(const v_int8x16& a) { return v_popcount(v_reinterpret_as_u8(a)); } @@ -3117,8 +1773,12 @@ inline v_uint64x2 v_popcount(const v_int64x2& a) #define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \ inline int v_signmask(const _Tpvec& a) \ { \ - fallback::_Tpvec a_(a); \ - return fallback::v_signmask(a_); \ + _Tpvec::lane_type a_[_Tpvec::nlanes]; \ + wasm_v128_store(a_, a.val); \ + int mask = 0; \ + for (int i = 0; i < _Tpvec::nlanes; i++) \ + mask |= (reinterpret_int(a_[i]) < 0) << i; \ + return mask; \ } \ inline bool v_check_all(const _Tpvec& a) \ { return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \ @@ -3273,22 +1933,35 @@ inline v_int32x4 v_ceil(const v_float32x4& a) inline v_int32x4 v_trunc(const v_float32x4& a) { return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); } -#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc, _Tpvec, _Tpnvec, _Tp, _Tpn) \ -inline _Tpnvec func(const _Tpvec& a) \ +#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \ +inline v_int32x4 func(const v_float64x2& a) \ { \ - fallback::_Tpvec a_(a); \ - return fallback::func(a_); \ + double a_[2]; \ + wasm_v128_store(a_, a.val); \ + int c_[4]; \ + c_[0] = cfunc(a_[0]); \ + c_[1] = cfunc(a_[1]); \ + c_[2] = 0; \ + c_[3] = 0; \ + return v_int32x4(wasm_v128_load(c_)); \ } -OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound, v_float64x2, v_int32x4, double, int) -OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor, v_float64x2, v_int32x4, double, int) -OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil, v_float64x2, v_int32x4, double, int) -OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int, v_float64x2, v_int32x4, double, int) +OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound) +OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor) +OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil) +OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int) inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) { - fallback::v_float64x2 a_(a), b_(b); - return fallback::v_round(a_, b_); + double a_[2], b_[2]; + wasm_v128_store(a_, a.val); + wasm_v128_store(b_, b.val); + int c_[4]; + c_[0] = cvRound(a_[0]); + c_[1] = cvRound(a_[1]); + c_[2] = cvRound(b_[0]); + c_[3] = cvRound(b_[1]); + return v_int32x4(wasm_v128_load(c_)); } #define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \ @@ -3782,14 +2455,27 @@ inline 
v_float32x4 v_cvt_f32(const v_int32x4& a)

 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
 {
-    fallback::v_float64x2 a_(a);
-    return fallback::v_cvt_f32(a_);
+    double a_[2];
+    wasm_v128_store(a_, a.val);
+    float c_[4];
+    c_[0] = (float)(a_[0]);
+    c_[1] = (float)(a_[1]);
+    c_[2] = 0;
+    c_[3] = 0;
+    return v_float32x4(wasm_v128_load(c_));
 }

 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
 {
-    fallback::v_float64x2 a_(a), b_(b);
-    return fallback::v_cvt_f32(a_, b_);
+    double a_[2], b_[2];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    float c_[4];
+    c_[0] = (float)(a_[0]);
+    c_[1] = (float)(a_[1]);
+    c_[2] = (float)(b_[0]);
+    c_[3] = (float)(b_[1]);
+    return v_float32x4(wasm_v128_load(c_));
 }

 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
@@ -3798,8 +2484,12 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a)
     v128_t p = v128_cvti32x4_i64x2(a.val);
     return v_float64x2(wasm_f64x2_convert_i64x2(p));
 #else
-    fallback::v_int32x4 a_(a);
-    return fallback::v_cvt_f64(a_);
+    int a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[0]);
+    c_[1] = (double)(a_[1]);
+    return v_float64x2(wasm_v128_load(c_));
 #endif
 }

@@ -3809,21 +2499,33 @@ inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
     v128_t p = v128_cvti32x4_i64x2_high(a.val);
     return v_float64x2(wasm_f64x2_convert_i64x2(p));
 #else
-    fallback::v_int32x4 a_(a);
-    return fallback::v_cvt_f64_high(a_);
+    int a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[2]);
+    c_[1] = (double)(a_[3]);
+    return v_float64x2(wasm_v128_load(c_));
 #endif
 }

 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 {
-    fallback::v_float32x4 a_(a);
-    return fallback::v_cvt_f64(a_);
+    float a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[0]);
+    c_[1] = (double)(a_[1]);
+    return v_float64x2(wasm_v128_load(c_));
 }

 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 {
-    fallback::v_float32x4 a_(a);
-    return fallback::v_cvt_f64_high(a_);
+    float a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[2]);
+    c_[1] = (double)(a_[3]);
+    return v_float64x2(wasm_v128_load(c_));
 }

 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
@@ -3831,8 +2533,12 @@ inline v_float64x2 v_cvt_f64(const v_int64x2& a)
 #ifdef __wasm_unimplemented_simd128__
     return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
 #else
-    fallback::v_int64x2 a_(a);
-    return fallback::v_cvt_f64(a_);
+    int64 a_[2];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[0]);
+    c_[1] = (double)(a_[1]);
+    return v_float64x2(wasm_v128_load(c_));
 #endif
 }

@@ -4049,13 +2755,20 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& a)

 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
-    return fallback::v_load_expand(ptr);
+    float a[4];
+    for (int i = 0; i < 4; i++)
+        a[i] = ptr[i];
+    return v_float32x4(wasm_v128_load(a));
 }

 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
-    fallback::v_float32x4 v_(v);
-    fallback::v_pack_store(ptr, v_);
+    float v_[4];
+    wasm_v128_store(v_, v.val);
+    ptr[0] = float16_t(v_[0]);
+    ptr[1] = float16_t(v_[1]);
+    ptr[2] = float16_t(v_[2]);
+    ptr[3] = float16_t(v_[3]);
 }

 inline void v_cleanup() {}
diff --git a/modules/core/include/opencv2/core/llapi/llapi.h b/modules/core/include/opencv2/core/llapi/llapi.h
index 805d9ed262..ce322aecf8 100644
--- a/modules/core/include/opencv2/core/llapi/llapi.h
+++ b/modules/core/include/opencv2/core/llapi/llapi.h
@@ -27,6 +27,14 @@ Using this approach OpenCV provides some basic low level functionality for external plugins
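
The hunk below adds CV_PLUGIN_EXPORTS so plugin entry points can be exported with the right linkage on Windows and GCC-compatible compilers. A hypothetical usage sketch (the function name and body are illustrative, not part of the patch; it assumes the cvResult values declared in this header):

#include <opencv2/core/llapi/llapi.h>

// Hypothetical plugin entry point: CV_PLUGIN_EXPORTS gives the symbol
// dllexport linkage on Windows or default visibility under GCC >= 4,
// so the plugin loader can resolve it at runtime.
extern "C" CV_PLUGIN_EXPORTS
cvResult CV_API_CALL example_plugin_probe(int requested_abi)
{
    return requested_abi >= 0 ? CV_ERROR_OK : CV_ERROR_FAIL;
}
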
for exter #define CV_API_CALL #endif +#ifndef CV_PLUGIN_EXPORTS +#if (defined _WIN32 || defined WINCE || defined __CYGWIN__) +# define CV_PLUGIN_EXPORTS __declspec(dllexport) +#elif defined __GNUC__ && __GNUC__ >= 4 +# define CV_PLUGIN_EXPORTS __attribute__ ((visibility ("default"))) +#endif +#endif + typedef enum cvResult { CV_ERROR_FAIL = -1, //!< Some error occurred (TODO Require to fill exception information) diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index bc676c1acd..84df297bf9 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -170,7 +170,9 @@ public: STD_VECTOR = 3 << KIND_SHIFT, STD_VECTOR_VECTOR = 4 << KIND_SHIFT, STD_VECTOR_MAT = 5 << KIND_SHIFT, - EXPR = 6 << KIND_SHIFT, //!< removed +#if OPENCV_ABI_COMPATIBILITY < 500 + EXPR = 6 << KIND_SHIFT, //!< removed: https://github.com/opencv/opencv/pull/17046 +#endif OPENGL_BUFFER = 7 << KIND_SHIFT, CUDA_HOST_MEM = 8 << KIND_SHIFT, CUDA_GPU_MAT = 9 << KIND_SHIFT, @@ -178,7 +180,9 @@ public: STD_VECTOR_UMAT =11 << KIND_SHIFT, STD_BOOL_VECTOR =12 << KIND_SHIFT, STD_VECTOR_CUDA_GPU_MAT = 13 << KIND_SHIFT, - STD_ARRAY =14 << KIND_SHIFT, +#if OPENCV_ABI_COMPATIBILITY < 500 + STD_ARRAY =14 << KIND_SHIFT, //!< removed: https://github.com/opencv/opencv/issues/18897 +#endif STD_ARRAY_MAT =15 << KIND_SHIFT }; @@ -572,24 +576,24 @@ CV_ENUM_FLAGS(UMatData::MemoryFlag) struct CV_EXPORTS MatSize { - explicit MatSize(int* _p); - int dims() const; + explicit MatSize(int* _p) CV_NOEXCEPT; + int dims() const CV_NOEXCEPT; Size operator()() const; const int& operator[](int i) const; int& operator[](int i); - operator const int*() const; // TODO OpenCV 4.0: drop this - bool operator == (const MatSize& sz) const; - bool operator != (const MatSize& sz) const; + operator const int*() const CV_NOEXCEPT; // TODO OpenCV 4.0: drop this + bool operator == (const MatSize& sz) const CV_NOEXCEPT; + bool operator != (const MatSize& sz) const CV_NOEXCEPT; int* p; }; struct CV_EXPORTS MatStep { - MatStep(); - explicit MatStep(size_t s); - const size_t& operator[](int i) const; - size_t& operator[](int i); + MatStep() CV_NOEXCEPT; + explicit MatStep(size_t s) CV_NOEXCEPT; + const size_t& operator[](int i) const CV_NOEXCEPT; + size_t& operator[](int i) CV_NOEXCEPT; operator size_t() const; MatStep& operator = (size_t s); @@ -694,11 +698,16 @@ sub-matrices. -# Process "foreign" data using OpenCV (for example, when you implement a DirectShow\* filter or a processing module for gstreamer, and so on). For example: @code - void process_video_frame(const unsigned char* pixels, - int width, int height, int step) + Mat process_video_frame(const unsigned char* pixels, + int width, int height, int step) { - Mat img(height, width, CV_8UC3, pixels, step); - GaussianBlur(img, img, Size(7,7), 1.5, 1.5); + // wrap input buffer + Mat img(height, width, CV_8UC3, (unsigned char*)pixels, step); + + Mat result; + GaussianBlur(img, result, Size(7, 7), 1.5, 1.5); + + return result; } @endcode -# Quickly initialize small matrices and/or get a super-fast element access. @@ -798,7 +807,7 @@ public: The constructed matrix can further be assigned to another matrix or matrix expression or can be allocated with Mat::create . In the former case, the old content is de-referenced. */ - Mat(); + Mat() CV_NOEXCEPT; /** @overload @param rows Number of rows in a 2D array. @@ -2184,7 +2193,7 @@ public: typedef MatConstIterator_<_Tp> const_iterator; //! 
default constructor - Mat_(); + Mat_() CV_NOEXCEPT; //! equivalent to Mat(_rows, _cols, DataType<_Tp>::type) Mat_(int _rows, int _cols); //! constructor that sets each matrix element to specified value @@ -2376,7 +2385,7 @@ class CV_EXPORTS UMat { public: //! default constructor - UMat(UMatUsageFlags usageFlags = USAGE_DEFAULT); + UMat(UMatUsageFlags usageFlags = USAGE_DEFAULT) CV_NOEXCEPT; //! constructs 2D matrix of the specified size and type // (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.) UMat(int rows, int cols, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT); @@ -2397,20 +2406,11 @@ public: UMat(const UMat& m, const Rect& roi); UMat(const UMat& m, const Range* ranges); UMat(const UMat& m, const std::vector& ranges); + + // FIXIT copyData=false is not implemented, drop this in favor of cv::Mat (OpenCV 5.0) //! builds matrix from std::vector with or without copying the data template explicit UMat(const std::vector<_Tp>& vec, bool copyData=false); - //! builds matrix from cv::Vec; the data is copied by default - template explicit UMat(const Vec<_Tp, n>& vec, bool copyData=true); - //! builds matrix from cv::Matx; the data is copied by default - template explicit UMat(const Matx<_Tp, m, n>& mtx, bool copyData=true); - //! builds matrix from a 2D point - template explicit UMat(const Point_<_Tp>& pt, bool copyData=true); - //! builds matrix from a 3D point - template explicit UMat(const Point3_<_Tp>& pt, bool copyData=true); - //! builds matrix from comma initializer - template explicit UMat(const MatCommaInitializer_<_Tp>& commaInitializer); - //! destructor - calls release() ~UMat(); //! assignment operators diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index d6296f8e2e..ff8297ffa4 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -111,7 +111,7 @@ _InputArray::_InputArray(const std::vector<_Tp>& vec) template inline _InputArray::_InputArray(const std::array<_Tp, _Nm>& arr) -{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_READ, arr.data(), Size(1, _Nm)); } +{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ, arr.data(), Size(1, _Nm)); } template inline _InputArray::_InputArray(const std::array& arr) @@ -169,7 +169,7 @@ template inline _InputArray _InputArray::rawIn(const std::array<_Tp, _Nm>& arr) { _InputArray v; - v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_READ; + v.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ; v.obj = (void*)arr.data(); v.sz = Size(1, _Nm); return v; @@ -191,7 +191,7 @@ inline bool _InputArray::isUMatVector() const { return kind() == _InputArray::S inline bool _InputArray::isMatx() const { return kind() == _InputArray::MATX; } inline bool _InputArray::isVector() const { return kind() == _InputArray::STD_VECTOR || kind() == _InputArray::STD_BOOL_VECTOR || - kind() == _InputArray::STD_ARRAY; } + (kind() == _InputArray::MATX && (sz.width <= 1 || sz.height <= 1)); } inline bool _InputArray::isGpuMat() const { return kind() == _InputArray::CUDA_GPU_MAT; } inline bool _InputArray::isGpuMatVector() const { return kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT; } @@ -210,7 +210,7 @@ _OutputArray::_OutputArray(std::vector<_Tp>& vec) template inline _OutputArray::_OutputArray(std::array<_Tp, _Nm>& arr) -{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); } +{ 
init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); } template inline _OutputArray::_OutputArray(std::array& arr) @@ -242,7 +242,7 @@ _OutputArray::_OutputArray(const std::vector<_Tp>& vec) template inline _OutputArray::_OutputArray(const std::array<_Tp, _Nm>& arr) -{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); } +{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); } template inline _OutputArray::_OutputArray(const std::array& arr) @@ -315,7 +315,7 @@ template inline _OutputArray _OutputArray::rawOut(std::array<_Tp, _Nm>& arr) { _OutputArray v; - v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE; + v.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE; v.obj = (void*)arr.data(); v.sz = Size(1, _Nm); return v; @@ -336,7 +336,7 @@ _InputOutputArray::_InputOutputArray(std::vector<_Tp>& vec) template inline _InputOutputArray::_InputOutputArray(std::array<_Tp, _Nm>& arr) -{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); } +{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); } template inline _InputOutputArray::_InputOutputArray(std::array& arr) @@ -368,7 +368,7 @@ _InputOutputArray::_InputOutputArray(const std::vector<_Tp>& vec) template inline _InputOutputArray::_InputOutputArray(const std::array<_Tp, _Nm>& arr) -{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); } +{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); } template inline _InputOutputArray::_InputOutputArray(const std::array& arr) @@ -443,7 +443,7 @@ template inline _InputOutputArray _InputOutputArray::rawInOut(std::array<_Tp, _Nm>& arr) { _InputOutputArray v; - v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW; + v.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW; v.obj = (void*)arr.data(); v.sz = Size(1, _Nm); return v; @@ -1116,11 +1116,11 @@ void Mat::push_back(const std::vector<_Tp>& v) ///////////////////////////// MatSize //////////////////////////// inline -MatSize::MatSize(int* _p) +MatSize::MatSize(int* _p) CV_NOEXCEPT : p(_p) {} inline -int MatSize::dims() const +int MatSize::dims() const CV_NOEXCEPT { return (p - 1)[0]; } @@ -1153,13 +1153,13 @@ int& MatSize::operator[](int i) } inline -MatSize::operator const int*() const +MatSize::operator const int*() const CV_NOEXCEPT { return p; } inline -bool MatSize::operator != (const MatSize& sz) const +bool MatSize::operator != (const MatSize& sz) const CV_NOEXCEPT { return !(*this == sz); } @@ -1169,25 +1169,25 @@ bool MatSize::operator != (const MatSize& sz) const ///////////////////////////// MatStep //////////////////////////// inline -MatStep::MatStep() +MatStep::MatStep() CV_NOEXCEPT { p = buf; p[0] = p[1] = 0; } inline -MatStep::MatStep(size_t s) +MatStep::MatStep(size_t s) CV_NOEXCEPT { p = buf; p[0] = s; p[1] = 0; } inline -const size_t& MatStep::operator[](int i) const +const size_t& MatStep::operator[](int i) const CV_NOEXCEPT { return p[i]; } inline -size_t& MatStep::operator[](int i) +size_t& MatStep::operator[](int i) CV_NOEXCEPT { return p[i]; } @@ -1210,7 +1210,7 @@ inline MatStep& MatStep::operator = (size_t s) 
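Editorial note (not part of the patch): the hunks above replace the dedicated `STD_ARRAY` kind with `MATX`, and `isVector()` now also accepts single-row/single-column `MATX` data, so `std::array` keeps behaving like a vector for InputArray consumers. A minimal sketch of the resulting behavior, assuming only OpenCV core:

```cpp
#include <array>
#include <opencv2/core.hpp>

int main()
{
    std::array<float, 4> a = {1.f, 2.f, 3.f, 4.f};
    cv::_InputArray ia(a);                          // wraps a.data() as Size(1, 4)
    CV_Assert(ia.kind() == cv::_InputArray::MATX);  // formerly STD_ARRAY
    CV_Assert(ia.isVector());                       // 1xN / Nx1 MATX counts as a vector
    CV_Assert(cv::sum(ia)[0] == 10.0);              // regular InputArray consumers still work
    return 0;
}
```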
////////////////////////////// Mat_<_Tp> //////////////////////////// template inline -Mat_<_Tp>::Mat_() +Mat_<_Tp>::Mat_() CV_NOEXCEPT : Mat() { flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value; diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 3315832654..3a76be2353 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -70,10 +70,12 @@ class CV_EXPORTS Image2D; class CV_EXPORTS_W_SIMPLE Device { public: - CV_WRAP Device(); + CV_WRAP Device() CV_NOEXCEPT; explicit Device(void* d); Device(const Device& d); Device& operator = (const Device& d); + Device(Device&& d) CV_NOEXCEPT; + Device& operator = (Device&& d) CV_NOEXCEPT; CV_WRAP ~Device(); void set(void* d); @@ -245,11 +247,13 @@ protected: class CV_EXPORTS Context { public: - Context(); + Context() CV_NOEXCEPT; explicit Context(int dtype); //!< @deprecated ~Context(); Context(const Context& c); Context& operator= (const Context& c); + Context(Context&& c) CV_NOEXCEPT; + Context& operator = (Context&& c) CV_NOEXCEPT; /** @deprecated */ bool create(); @@ -298,10 +302,12 @@ public: class CV_EXPORTS Platform { public: - Platform(); + Platform() CV_NOEXCEPT; ~Platform(); Platform(const Platform& p); Platform& operator = (const Platform& p); + Platform(Platform&& p) CV_NOEXCEPT; + Platform& operator = (Platform&& p) CV_NOEXCEPT; void* ptr() const; @@ -357,11 +363,13 @@ void initializeContextFromHandle(Context& ctx, void* platform, void* context, vo class CV_EXPORTS Queue { public: - Queue(); + Queue() CV_NOEXCEPT; explicit Queue(const Context& c, const Device& d=Device()); ~Queue(); Queue(const Queue& q); Queue& operator = (const Queue& q); + Queue(Queue&& q) CV_NOEXCEPT; + Queue& operator = (Queue&& q) CV_NOEXCEPT; bool create(const Context& c=Context(), const Device& d=Device()); void finish(); @@ -384,7 +392,7 @@ class CV_EXPORTS KernelArg public: enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, PTR_ONLY = 16, NO_SIZE=256 }; KernelArg(int _flags, UMat* _m, int wscale=1, int iwscale=1, const void* _obj=0, size_t _sz=0); - KernelArg(); + KernelArg() CV_NOEXCEPT; static KernelArg Local(size_t localMemSize) { return KernelArg(LOCAL, 0, 1, 1, 0, localMemSize); } @@ -421,13 +429,15 @@ public: class CV_EXPORTS Kernel { public: - Kernel(); + Kernel() CV_NOEXCEPT; Kernel(const char* kname, const Program& prog); Kernel(const char* kname, const ProgramSource& prog, const String& buildopts = String(), String* errmsg=0); ~Kernel(); Kernel(const Kernel& k); Kernel& operator = (const Kernel& k); + Kernel(Kernel&& k) CV_NOEXCEPT; + Kernel& operator = (Kernel&& k) CV_NOEXCEPT; bool empty() const; bool create(const char* kname, const Program& prog); @@ -498,12 +508,13 @@ protected: class CV_EXPORTS Program { public: - Program(); + Program() CV_NOEXCEPT; Program(const ProgramSource& src, const String& buildflags, String& errmsg); Program(const Program& prog); - Program& operator = (const Program& prog); + Program(Program&& prog) CV_NOEXCEPT; + Program& operator = (Program&& prog) CV_NOEXCEPT; ~Program(); bool create(const ProgramSource& src, @@ -544,13 +555,15 @@ class CV_EXPORTS ProgramSource public: typedef uint64 hash_t; // deprecated - ProgramSource(); + ProgramSource() CV_NOEXCEPT; explicit ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash); explicit ProgramSource(const String& prog); // deprecated explicit ProgramSource(const char* prog); // deprecated 
     ~ProgramSource();
     ProgramSource(const ProgramSource& prog);
     ProgramSource& operator = (const ProgramSource& prog);
+    ProgramSource(ProgramSource&& prog) CV_NOEXCEPT;
+    ProgramSource& operator = (ProgramSource&& prog) CV_NOEXCEPT;
 
     const String& source() const; // deprecated
     hash_t hash() const; // deprecated
@@ -614,7 +627,7 @@ protected:
 class CV_EXPORTS PlatformInfo
 {
 public:
-    PlatformInfo();
+    PlatformInfo() CV_NOEXCEPT;
     /**
      * @param id pointer cl_platform_id (cl_platform_id*)
      */
@@ -623,10 +636,17 @@ public:
     PlatformInfo(const PlatformInfo& i);
     PlatformInfo& operator =(const PlatformInfo& i);
+    PlatformInfo(PlatformInfo&& i) CV_NOEXCEPT;
+    PlatformInfo& operator = (PlatformInfo&& i) CV_NOEXCEPT;
 
     String name() const;
     String vendor() const;
+
+    /// See CL_PLATFORM_VERSION
     String version() const;
+    int versionMajor() const;
+    int versionMinor() const;
+
     int deviceNumber() const;
     void getDevice(Device& device, int d) const;
@@ -678,7 +698,7 @@ CV_EXPORTS void buildOptionsAddMatrixDescription(String& buildOptions, const Str
 class CV_EXPORTS Image2D
 {
 public:
-    Image2D();
+    Image2D() CV_NOEXCEPT;
 
     /**
     @param src UMat object from which to get image properties and data
@@ -691,6 +711,8 @@ public:
     ~Image2D();
 
     Image2D & operator = (const Image2D & i);
+    Image2D(Image2D &&) CV_NOEXCEPT;
+    Image2D &operator=(Image2D &&) CV_NOEXCEPT;
 
     /** Indicates if creating an aliased image should succeed.
     Depends on the underlying platform and the dimensions of the UMat.
@@ -743,9 +765,11 @@ public:
     /** Get associated ocl::Context */
     Context& getContext() const;
-    /** Get associated ocl::Device */
+    /** Get the single default associated ocl::Device */
     Device& getDevice() const;
-    /** Get associated ocl::Queue */
+    /** Get the single ocl::Queue that is associated with the ocl::Context and
+     * the single default ocl::Device
+     */
     Queue& getQueue() const;
 
     bool useOpenCL() const;
diff --git a/modules/core/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp b/modules/core/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp
new file mode 100644
index 0000000000..b172cac34d
--- /dev/null
+++ b/modules/core/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_PARALLEL_FOR_OPENMP_HPP
+#define OPENCV_CORE_PARALLEL_FOR_OPENMP_HPP
+
+#include "opencv2/core/parallel/parallel_backend.hpp"
+
+#if !defined(_OPENMP) && !defined(OPENCV_SKIP_OPENMP_PRESENSE_CHECK)
+#error "This file must be compiled with OpenMP enabled"
+#endif
+
+#include <omp.h>
+
+namespace cv { namespace parallel { namespace openmp {
+
+/** OpenMP parallel_for API implementation
+ *
+ * @sa setParallelForBackend
+ * @ingroup core_parallel_backend
+ */
+class ParallelForBackend : public ParallelForAPI
+{
+protected:
+    int numThreads;
+    int numThreadsMax;
+public:
+    ParallelForBackend()
+    {
+        numThreads = 0;
+        numThreadsMax = omp_get_max_threads();
+    }
+
+    virtual ~ParallelForBackend() {}
+
+    virtual void parallel_for(int tasks, FN_parallel_for_body_cb_t body_callback, void* callback_data) CV_OVERRIDE
+    {
+#pragma omp parallel for schedule(dynamic) num_threads(numThreads > 0 ? numThreads : numThreadsMax)
+        for (int i = 0; i < tasks; ++i)
+            body_callback(i, i + 1, callback_data);
+    }
+
+    virtual int getThreadNum() const CV_OVERRIDE
+    {
+        return omp_get_thread_num();
+    }
+
+    virtual int getNumThreads() const CV_OVERRIDE
+    {
+        return numThreads > 0
+               ? numThreads
+               : numThreadsMax;
+    }
+
+    virtual int setNumThreads(int nThreads) CV_OVERRIDE
+    {
+        int oldNumThreads = numThreads;
+        numThreads = nThreads;
+        // nothing needed as numThreads is used in #pragma omp parallel for directly
+        return oldNumThreads;
+    }
+
+    const char* getName() const CV_OVERRIDE
+    {
+        return "openmp";
+    }
+};
+
+}}}  // namespace
+
+#endif  // OPENCV_CORE_PARALLEL_FOR_OPENMP_HPP
diff --git a/modules/core/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp b/modules/core/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
new file mode 100644
index 0000000000..264def5f50
--- /dev/null
+++ b/modules/core/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
@@ -0,0 +1,153 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_PARALLEL_FOR_TBB_HPP
+#define OPENCV_CORE_PARALLEL_FOR_TBB_HPP
+
+#include "opencv2/core/parallel/parallel_backend.hpp"
+#include <opencv2/core/utils/logger.hpp>
+
+#ifndef TBB_SUPPRESS_DEPRECATED_MESSAGES  // suppress warning
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#endif
+#include "tbb/tbb.h"
+#if !defined(TBB_INTERFACE_VERSION)
+#error "Unknown/unsupported TBB version"
+#endif
+
+#if TBB_INTERFACE_VERSION >= 8000
+#include "tbb/task_arena.h"
+#endif
+
+namespace cv { namespace parallel { namespace tbb {
+
+using namespace ::tbb;
+
+#if TBB_INTERFACE_VERSION >= 8000
+static tbb::task_arena& getArena()
+{
+    static tbb::task_arena tbbArena(tbb::task_arena::automatic);
+    return tbbArena;
+}
+#else
+static tbb::task_scheduler_init& getScheduler()
+{
+    static tbb::task_scheduler_init tbbScheduler(tbb::task_scheduler_init::deferred);
+    return tbbScheduler;
+}
+#endif
+
+/** TBB parallel_for API implementation
+ *
+ * @sa setParallelForBackend
+ * @ingroup core_parallel_backend
+ */
+class ParallelForBackend : public ParallelForAPI
+{
+protected:
+    int numThreads;
+    int numThreadsMax;
+public:
+    ParallelForBackend()
+    {
+        CV_LOG_INFO(NULL, "Initializing TBB parallel backend: TBB_INTERFACE_VERSION=" << TBB_INTERFACE_VERSION);
+        numThreads = 0;
+#if TBB_INTERFACE_VERSION >= 8000
+        (void)getArena();
+#else
+        (void)getScheduler();
+#endif
+    }
+
+    virtual ~ParallelForBackend() {}
+
+    class CallbackProxy
+    {
+        const FN_parallel_for_body_cb_t& callback;
+        void* const callback_data;
+        const int tasks;
+    public:
+        inline CallbackProxy(int tasks_, FN_parallel_for_body_cb_t& callback_, void* callback_data_)
+            : callback(callback_), callback_data(callback_data_), tasks(tasks_)
+        {
+            // nothing
+        }
+
+        void operator()(const tbb::blocked_range<int>& range) const
+        {
+            this->callback(range.begin(), range.end(), callback_data);
+        }
+
+        void operator()() const
+        {
+            tbb::parallel_for(tbb::blocked_range<int>(0, tasks), *this);
+        }
+    };
+
+    virtual void parallel_for(int tasks, FN_parallel_for_body_cb_t body_callback, void* callback_data) CV_OVERRIDE
+    {
+        CallbackProxy task(tasks, body_callback, callback_data);
+#if TBB_INTERFACE_VERSION >= 8000
+        getArena().execute(task);
+#else
+        task();
+#endif
+    }
+
+    virtual int getThreadNum() const CV_OVERRIDE
+    {
+#if TBB_INTERFACE_VERSION >= 9100
+        return tbb::this_task_arena::current_thread_index();
+#elif TBB_INTERFACE_VERSION >= 8000
+        return tbb::task_arena::current_thread_index();
+#else
+        return 0;
+#endif
+    }
+
+    virtual int getNumThreads() const CV_OVERRIDE
+    {
+#if TBB_INTERFACE_VERSION >= 9100
+        return getArena().max_concurrency();
+#elif TBB_INTERFACE_VERSION >= 8000
+        return numThreads > 0
+               ? numThreads
+               : tbb::task_scheduler_init::default_num_threads();
+#else
+        return getScheduler().is_active()
+               ? numThreads
+               : tbb::task_scheduler_init::default_num_threads();
+#endif
+    }
+
+    virtual int setNumThreads(int nThreads) CV_OVERRIDE
+    {
+        int oldNumThreads = numThreads;
+        numThreads = nThreads;
+
+#if TBB_INTERFACE_VERSION >= 8000
+        auto& tbbArena = getArena();
+        if (tbbArena.is_active())
+            tbbArena.terminate();
+        if (numThreads > 0)
+            tbbArena.initialize(numThreads);
+#else
+        auto& tbbScheduler = getScheduler();
+        if (tbbScheduler.is_active())
+            tbbScheduler.terminate();
+        if (numThreads > 0)
+            tbbScheduler.initialize(numThreads);
+#endif
+        return oldNumThreads;
+    }
+
+    const char* getName() const CV_OVERRIDE
+    {
+        return "tbb";
+    }
+};
+
+}}}  // namespace
+
+#endif  // OPENCV_CORE_PARALLEL_FOR_TBB_HPP
diff --git a/modules/core/include/opencv2/core/parallel/parallel_backend.hpp b/modules/core/include/opencv2/core/parallel/parallel_backend.hpp
new file mode 100644
index 0000000000..c3e8333c1c
--- /dev/null
+++ b/modules/core/include/opencv2/core/parallel/parallel_backend.hpp
@@ -0,0 +1,90 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_PARALLEL_BACKEND_HPP
+#define OPENCV_CORE_PARALLEL_BACKEND_HPP
+
+#include "opencv2/core/cvdef.h"
+#include <memory>
+
+namespace cv { namespace parallel {
+#ifndef CV_API_CALL
+#define CV_API_CALL
+#endif
+
+/** @addtogroup core_parallel_backend
+ * @{
+ * The API below is provided to resolve the problem of CPU resource over-subscription caused by multiple thread pools from different multi-threading frameworks.
+ * This is a common problem when the threading framework OpenCV was compiled with differs from the one used by the user's application.
+ *
+ * Applications can replace the OpenCV `parallel_for()` backend with their own implementation (to reuse the application's thread pool).
+ *
+ *
+ * ### Backend API usage examples
+ *
+ * #### Intel TBB
+ *
+ * - include header with simple implementation of TBB backend:
+ *   @snippet parallel_backend/example-tbb.cpp tbb_include
+ * - execute backend replacement code:
+ *   @snippet parallel_backend/example-tbb.cpp tbb_backend
+ * - configuration of compiler/linker options is the responsibility of the application's build scripts
+ *
+ * #### OpenMP
+ *
+ * - include header with simple implementation of OpenMP backend:
+ *   @snippet parallel_backend/example-openmp.cpp openmp_include
+ * - execute backend replacement code:
+ *   @snippet parallel_backend/example-openmp.cpp openmp_backend
+ * - configuration of compiler/linker options is the responsibility of the application's build scripts
+ *
+ *
+ * ### Plugins support
+ *
+ * Runtime configuration options:
+ * - change backend priority: `OPENCV_PARALLEL_PRIORITY_<backend_name>=9999`
+ * - disable backend: `OPENCV_PARALLEL_PRIORITY_<backend_name>=0`
+ * - specify list of backends with high priority (>100000): `OPENCV_PARALLEL_PRIORITY_LIST=TBB,OPENMP`. Unknown backends are registered as new plugins.
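 *
 * ### Example: switching the backend at runtime (editorial sketch, not part of the original patch)
 *
 * The string-based overload declared below selects a backend by name; the name is
 * resolved against built-in backends and plugins, so `"tbb"` here is an assumption
 * that the TBB backend/plugin is actually available — the call returns false otherwise:
 * ```
 * #include <opencv2/core.hpp>
 * #include <opencv2/core/parallel/parallel_backend.hpp>
 *
 * int main()
 * {
 *     // Not thread-safe: call from main() before other OpenCV processing
 *     // functions run and before any worker threads are created.
 *     if (!cv::parallel::setParallelForBackend("tbb"))
 *     {
 *         // request failed; OpenCV keeps the current built-in backend
 *     }
 *     cv::parallel_for_(cv::Range(0, 1000), [](const cv::Range& r)
 *     {
 *         for (int i = r.start; i < r.end; i++)
 *         {
 *             // process item i
 *         }
 *     });
 *     return 0;
 * }
 * ```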
+ *
+ */
+
+/** Interface for parallel_for backend implementations
+ *
+ * @sa setParallelForBackend
+ */
+class CV_EXPORTS ParallelForAPI
+{
+public:
+    virtual ~ParallelForAPI();
+
+    typedef void (CV_API_CALL *FN_parallel_for_body_cb_t)(int start, int end, void* data);
+
+    virtual void parallel_for(int tasks, FN_parallel_for_body_cb_t body_callback, void* callback_data) = 0;
+
+    virtual int getThreadNum() const = 0;
+
+    virtual int getNumThreads() const = 0;
+
+    virtual int setNumThreads(int nThreads) = 0;
+
+    virtual const char* getName() const = 0;
+};
+
+/** @brief Replace OpenCV parallel_for backend
+ *
+ * An application can replace the OpenCV `parallel_for()` backend with its own implementation.
+ *
+ * @note This call is not thread-safe. Consider calling this function from `main()` before any other OpenCV processing functions (and without any other created threads).
+ */
+CV_EXPORTS void setParallelForBackend(const std::shared_ptr<ParallelForAPI>& api, bool propagateNumThreads = true);
+
+/** @brief Change OpenCV parallel_for backend
+ *
+ * @note This call is not thread-safe. Consider calling this function from `main()` before any other OpenCV processing functions (and without any other created threads).
+ */
+CV_EXPORTS_W bool setParallelForBackend(const std::string& backendName, bool propagateNumThreads = true);
+
+//! @}
+}}  // namespace
+#endif  // OPENCV_CORE_PARALLEL_BACKEND_HPP
diff --git a/modules/core/include/opencv2/core/quaternion.hpp b/modules/core/include/opencv2/core/quaternion.hpp
index c72ee8c37f..8c21501e3f 100644
--- a/modules/core/include/opencv2/core/quaternion.hpp
+++ b/modules/core/include/opencv2/core/quaternion.hpp
@@ -27,6 +27,7 @@
 #define OPENCV_CORE_QUATERNION_HPP
 
 #include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
 #include <iostream>
 namespace cv
 {
@@ -51,6 +52,83 @@ enum QuatAssumeType
     QUAT_ASSUME_UNIT
 };
 
+class QuatEnum
+{
+public:
+    /** @brief Enum of Euler angles type.
+     *
+     * Without considering the possibility of using two different conventions for the definition of the rotation axes,
+     * there exist twelve possible sequences of rotation axes, divided into two groups:
+     * - Proper Euler angles (Z-X-Z, X-Y-X, Y-Z-Y, Z-Y-Z, X-Z-X, Y-X-Y)
+     * - Tait–Bryan angles (X-Y-Z, Y-Z-X, Z-X-Y, X-Z-Y, Z-Y-X, Y-X-Z).
+     *
+     * The three elemental rotations may be [extrinsic](https://en.wikipedia.org/wiki/Euler_angles#Definition_by_extrinsic_rotations)
+     * (rotations about the axes *xyz* of the original coordinate system, which is assumed to remain motionless),
+     * or [intrinsic](https://en.wikipedia.org/wiki/Euler_angles#Definition_by_intrinsic_rotations) (rotations about the axes of the rotating coordinate system *XYZ*, attached to the moving body, which changes its orientation after each elemental rotation).
+     *
+     * Extrinsic and intrinsic rotations are closely related: an intrinsic rotation sequence is equivalent to the extrinsic sequence with the same angles applied about the axes in reverse order (see the example at the end of this description).
+     *
+     * The Euler angles are defined as follows:
+     * - \f$\theta_1 \f$ represents the first rotation angle,
+     * - \f$\theta_2 \f$ represents the second rotation angle,
+     * - \f$\theta_3 \f$ represents the third rotation angle.
+     *
+     * For intrinsic rotations in the order of X-Y-Z, the rotation matrix R can be calculated by: \f[R = X(\theta_1) Y(\theta_2) Z(\theta_3) \f]
+     * For extrinsic rotations in the order of X-Y-Z, the rotation matrix R can be calculated by: \f[R = Z({\theta_3}) Y({\theta_2}) X({\theta_1})\f]
+     * where
+     * \f[X({\theta})={\begin{bmatrix}1&0&0\\0&\cos{\theta}&-\sin{\theta}\\0&\sin{\theta}&\cos{\theta}\end{bmatrix}},
+     * Y({\theta})={\begin{bmatrix}\cos{\theta}&0&\sin{\theta}\\0&1&0\\-\sin{\theta}&0&\cos{\theta}\end{bmatrix}},
+     * Z({\theta})={\begin{bmatrix}\cos{\theta}&-\sin{\theta}&0\\\sin{\theta}&\cos{\theta}&0\\0&0&1\end{bmatrix}}.
+     * \f]
+     *
+     * The function is designed according to this set of conventions:
+     * - [Right handed](https://en.wikipedia.org/wiki/Right_hand_rule) reference frames are adopted, and the [right hand rule](https://en.wikipedia.org/wiki/Right_hand_rule) is used to determine the sign of angles.
+     * - Each matrix is meant to represent an [active rotation](https://en.wikipedia.org/wiki/Active_and_passive_transformation) (the composing and composed matrices
+     *   are supposed to act on the coordinates of vectors defined in the initial fixed reference frame and give as a result the coordinates of a rotated vector defined in the same reference frame).
+     * - For \f$\theta_1\f$ and \f$\theta_3\f$, the valid range is (−π, π].
+     *
+     *   For \f$\theta_2\f$, the valid range is [−π/2, π/2] or [0, π].
+     *
+     * For Tait–Bryan angles, the valid range of \f$\theta_2\f$ is [−π/2, π/2]. When transforming a quaternion to Euler angles, the solution of Euler angles is unique provided that \f$\theta_2 \in (−π/2, π/2)\f$.
+     * If \f$\theta_2 = −π/2\f$ or \f$\theta_2 = π/2\f$, there are infinite solutions. The common name for this situation is gimbal lock.
+     * For proper Euler angles, the valid range of \f$\theta_2\f$ is [0, π]. The solutions of Euler angles are unique provided that \f$\theta_2 \in (0, π)\f$. If \f$\theta_2 = 0\f$ or \f$\theta_2 = π\f$,
+     * there are infinite solutions and gimbal lock will occur.
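+     *
+     * For example (an editorial sketch appended to this description, not part of the
+     * original patch; it uses createFromEulerAngles and toEulerAngles declared later
+     * in this header):
+     * ```
+     * Vec3d angles(0.1, 0.2, 0.3); // theta1, theta2, theta3
+     * // Intrinsic X-Y-Z equals extrinsic Z-Y-X with the angle order reversed:
+     * // both compose to q_X(0.1) * q_Y(0.2) * q_Z(0.3).
+     * Quatd qi = Quatd::createFromEulerAngles(angles, QuatEnum::INT_XYZ);
+     * Quatd qe = Quatd::createFromEulerAngles(Vec3d(0.3, 0.2, 0.1), QuatEnum::EXT_ZYX);
+     * // qi and qe represent the same rotation; away from gimbal lock the
+     * // angles are recovered uniquely:
+     * Vec3d back = qi.toEulerAngles(QuatEnum::INT_XYZ); // back ~= angles
+     * ```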
+ */ + enum EulerAnglesType + { + INT_XYZ, ///< Intrinsic rotations with the Euler angles type X-Y-Z + INT_XZY, ///< Intrinsic rotations with the Euler angles type X-Z-Y + INT_YXZ, ///< Intrinsic rotations with the Euler angles type Y-X-Z + INT_YZX, ///< Intrinsic rotations with the Euler angles type Y-Z-X + INT_ZXY, ///< Intrinsic rotations with the Euler angles type Z-X-Y + INT_ZYX, ///< Intrinsic rotations with the Euler angles type Z-Y-X + INT_XYX, ///< Intrinsic rotations with the Euler angles type X-Y-X + INT_XZX, ///< Intrinsic rotations with the Euler angles type X-Z-X + INT_YXY, ///< Intrinsic rotations with the Euler angles type Y-X-Y + INT_YZY, ///< Intrinsic rotations with the Euler angles type Y-Z-Y + INT_ZXZ, ///< Intrinsic rotations with the Euler angles type Z-X-Z + INT_ZYZ, ///< Intrinsic rotations with the Euler angles type Z-Y-Z + + EXT_XYZ, ///< Extrinsic rotations with the Euler angles type X-Y-Z + EXT_XZY, ///< Extrinsic rotations with the Euler angles type X-Z-Y + EXT_YXZ, ///< Extrinsic rotations with the Euler angles type Y-X-Z + EXT_YZX, ///< Extrinsic rotations with the Euler angles type Y-Z-X + EXT_ZXY, ///< Extrinsic rotations with the Euler angles type Z-X-Y + EXT_ZYX, ///< Extrinsic rotations with the Euler angles type Z-Y-X + EXT_XYX, ///< Extrinsic rotations with the Euler angles type X-Y-X + EXT_XZX, ///< Extrinsic rotations with the Euler angles type X-Z-X + EXT_YXY, ///< Extrinsic rotations with the Euler angles type Y-X-Y + EXT_YZY, ///< Extrinsic rotations with the Euler angles type Y-Z-Y + EXT_ZXZ, ///< Extrinsic rotations with the Euler angles type Z-X-Z + EXT_ZYZ, ///< Extrinsic rotations with the Euler angles type Z-Y-Z + #ifndef CV_DOXYGEN + EULER_ANGLES_MAX_VALUE + #endif + }; + +}; + template class Quat; template std::ostream& operator<<(std::ostream&, const Quat<_Tp>&); @@ -133,9 +211,9 @@ class Quat { static_assert(std::is_floating_point<_Tp>::value, "Quaternion only make sense with type of float or double"); using value_type = _Tp; - public: static constexpr _Tp CV_QUAT_EPS = (_Tp)1.e-6; + static constexpr _Tp CV_QUAT_CONVERT_THRESHOLD = (_Tp)1.e-6; Quat(); @@ -182,6 +260,41 @@ public: */ static Quat<_Tp> createFromRvec(InputArray rvec); + /** + * @brief + * from Euler angles + * + * A quaternion can be generated from Euler angles by combining the quaternion representations of the Euler rotations. + * + * For example, if we use intrinsic rotations in the order of X-Y-Z,\f$\theta_1 \f$ is rotation around the X-axis, \f$\theta_2 \f$ is rotation around the Y-axis, + * \f$\theta_3 \f$ is rotation around the Z-axis. The final quaternion q can be calculated by + * + * \f[ {q} = q_{X, \theta_1} q_{Y, \theta_2} q_{Z, \theta_3}\f] + * where \f$ q_{X, \theta_1} \f$ is created from @ref createFromXRot, \f$ q_{Y, \theta_2} \f$ is created from @ref createFromYRot, + * \f$ q_{Z, \theta_3} \f$ is created from @ref createFromZRot. + * @param angles the Euler angles in a vector of length 3 + * @param eulerAnglesType the convertion Euler angles type + */ + static Quat<_Tp> createFromEulerAngles(const Vec<_Tp, 3> &angles, QuatEnum::EulerAnglesType eulerAnglesType); + + /** + * @brief get a quaternion from a rotation about the Y-axis by \f$\theta\f$ . + * \f[q = \cos(\theta/2)+0 i+ sin(\theta/2) j +0k \f] + */ + static Quat<_Tp> createFromYRot(const _Tp theta); + + /** + * @brief get a quaternion from a rotation about the X-axis by \f$\theta\f$ . 
+     * \f[q = \cos(\theta/2)+\sin(\theta/2) i +0 j +0 k \f]
+     */
+    static Quat<_Tp> createFromXRot(const _Tp theta);
+
+    /**
+     * @brief get a quaternion from a rotation about the Z-axis by \f$\theta\f$.
+     * \f[q = \cos(\theta/2)+0 i +0 j +\sin(\theta/2) k \f]
+     */
+    static Quat<_Tp> createFromZRot(const _Tp theta);
+
     /**
      * @brief a way to get element.
      * @param index over a range [0, 3].
@@ -277,17 +390,18 @@ public:
      * For example
      * ```
      * Quatd q(1,2,3,4);
-     * power(q, 2);
+     * power(q, 2.0);
      *
      * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
      * double angle = CV_PI;
      * Vec3d axis{0, 0, 1};
      * Quatd q1 = Quatd::createFromAngleAxis(angle, axis); //generate a unit quat by axis and angle
-     * power(q1, 2, assumeUnit);//This assumeUnit means q1 is a unit quaternion.
+     * power(q1, 2.0, assumeUnit);//This assumeUnit means q1 is a unit quaternion.
      * ```
+     * @note the type of the index \f$x\f$ should be the same as the value type of the quaternion.
      */
-    template
-    friend Quat power(const Quat &q, _T x, QuatAssumeType assumeUnit);
+    template
+    friend Quat power(const Quat &q, const T x, QuatAssumeType assumeUnit);
 
     /**
      * @brief return the value of power function with index \f$x\f$.
@@ -298,17 +412,16 @@ public:
      * For example
      * ```
      * Quatd q(1,2,3,4);
-     * q.power(2);
+     * q.power(2.0);
      *
     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
      * double angle = CV_PI;
      * Vec3d axis{0, 0, 1};
      * Quatd q1 = Quatd::createFromAngleAxis(angle, axis); //generate a unit quat by axis and angle
-     * q1.power(2, assumeUnit); //This assumeUnit means q1 is a unit quaternion
+     * q1.power(2.0, assumeUnit); //This assumeUnit means q1 is a unit quaternion
      * ```
      */
-    template
-    Quat<_Tp> power(_T x, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+    Quat<_Tp> power(const _Tp x, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
 
     /**
      * @brief return \f$\sqrt{q}\f$.
@@ -811,8 +924,8 @@ public:
     /**
      * @brief transform a quaternion to a 3x3 rotation matrix.
      * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
-     * this function will save some computations. Otherwise, this function will normalized this
-     * quaternion at first then to do the transformation.
+     * this function will save some computations. Otherwise, this function will normalize this
+     * quaternion at first then do the transformation.
      *
     * @note Matrix A which is to be rotated should have the form
      * \f[\begin{bmatrix}
@@ -845,8 +958,8 @@ public:
     /**
      * @brief transform a quaternion to a 4x4 rotation matrix.
      * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
-     * this function will save some computations. Otherwise, this function will normalized this
-     * quaternion at first then to do the transformation.
+     * this function will save some computations. Otherwise, this function will normalize this
+     * quaternion at first then do the transformation.
     *
      * The operation is similar to toRotMat3x3
      * except that the points matrix should have the form
      * \f[\begin{bmatrix}
@@ -859,6 +972,7 @@
      *
      * @sa toRotMat3x3
      */
+
     Matx<_Tp, 4, 4> toRotMat4x4(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
 
     /**
@@ -1073,46 +1187,434 @@ public:
                          const Quat<_Tp> &q2, const Quat<_Tp> &q3, const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
-
+
+    /**
+     * @brief Return opposite quaternion \f$-p\f$
+     * which satisfies \f$p + (-p) = 0.\f$
+     *
+     * For example
+     * ```
+     * Quatd q{1, 2, 3, 4};
+     * std::cout << -q << std::endl; // [-1, -2, -3, -4]
+     * ```
+     */
     Quat<_Tp> operator-() const;
 
+    /**
+     * @brief return true if two quaternions p and q are nearly equal, i.e.
+     * when the absolute value of the difference between each \f$p_i\f$ and \f$q_i\f$ is less than CV_QUAT_EPS.
+     */
     bool operator==(const Quat<_Tp>&) const;
 
+    /**
+     * @brief Addition operator of two quaternions p and q.
+     * It returns a new quaternion whose elements are the sums of \f$p_i\f$ and \f$q_i\f$.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p + q << std::endl; //[6, 8, 10, 12]
+     * ```
+     */
     Quat<_Tp> operator+(const Quat<_Tp>&) const;
 
+    /**
+     * @brief Addition assignment operator of two quaternions p and q.
+     * It adds the right operand to the left operand and assigns the result to the left operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p += q; // equivalent to p = p + q
+     * std::cout << p << std::endl; //[6, 8, 10, 12]
+     *
+     * ```
+     */
     Quat<_Tp>& operator+=(const Quat<_Tp>&);
 
+    /**
+     * @brief Subtraction operator of two quaternions p and q.
+     * It returns a new quaternion whose elements are the sums of \f$p_i\f$ and \f$-q_i\f$.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p - q << std::endl; //[-4, -4, -4, -4]
+     * ```
+     */
     Quat<_Tp> operator-(const Quat<_Tp>&) const;
 
+    /**
+     * @brief Subtraction assignment operator of two quaternions p and q.
+     * It subtracts the right operand from the left operand and assigns the result to the left operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p -= q; // equivalent to p = p - q
+     * std::cout << p << std::endl; //[-4, -4, -4, -4]
+     *
+     * ```
+     */
     Quat<_Tp>& operator-=(const Quat<_Tp>&);
 
+    /**
+     * @brief Multiplication assignment operator of two quaternions p and q.
+     * It multiplies the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of quaternion multiplication:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [p_0, \boldsymbol{u}]*[q_0, \boldsymbol{v}]\\
+     * &=[p_0q_0 - \boldsymbol{u}\cdot \boldsymbol{v}, p_0\boldsymbol{v} + q_0\boldsymbol{u}+ \boldsymbol{u}\times \boldsymbol{v}].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$\cdot\f$ means dot product and \f$\times \f$ means cross product.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p *= q; // equivalent to p = p * q
+     * std::cout << p << std::endl; //[-60, 12, 30, 24]
+     * ```
+     */
     Quat<_Tp>& operator*=(const Quat<_Tp>&);
 
-    Quat<_Tp>& operator*=(const _Tp&);
+    /**
+     * @brief Multiplication assignment operator of a quaternion and a scalar.
+     * It multiplies the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z] * s\\
+     * &=[w * s, x * s, y * s, z * s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * p *= s; // equivalent to p = p * s
+     * std::cout << p << std::endl; //[2.0, 4.0, 6.0, 8.0]
+     * ```
+     * @note the type of the scalar should be the same as the value type of the quaternion.
+     */
+    Quat<_Tp>& operator*=(const _Tp s);
+
+    /**
+     * @brief Multiplication operator of two quaternions p and q.
+     * Multiplies values on either side of the operator.
+     *
+     * Rule of quaternion multiplication:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [p_0, \boldsymbol{u}]*[q_0, \boldsymbol{v}]\\
+     * &=[p_0q_0 - \boldsymbol{u}\cdot \boldsymbol{v}, p_0\boldsymbol{v} + q_0\boldsymbol{u}+ \boldsymbol{u}\times \boldsymbol{v}].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$\cdot\f$ means dot product and \f$\times \f$ means cross product.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p * q << std::endl; //[-60, 12, 30, 24]
+     * ```
+     */
     Quat<_Tp> operator*(const Quat<_Tp>&) const;
 
-    Quat<_Tp> operator/(const _Tp&) const;
+    /**
+     * @brief Division operator of a quaternion and a scalar.
+     * It divides the left operand by the right operand.
+     *
+     * Rule of quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z] / s\\
+     * &=[w/s, x/s, y/s, z/s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * std::cout << p / s << std::endl; //[0.5, 1, 1.5, 2]
+     * ```
+     * @note the type of the scalar should be the same as the value type of the quaternion.
+     */
+    Quat<_Tp> operator/(const _Tp s) const;
+
+    /**
+     * @brief Division operator of two quaternions p and q.
+     * Divides left hand operand by right hand operand.
+     *
+     * Rule of quaternion division with a quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q &= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p / q << std::endl; // equivalent to p * q.inv()
+     * ```
+     */
     Quat<_Tp> operator/(const Quat<_Tp>&) const;
 
-    Quat<_Tp>& operator/=(const _Tp&);
+    /**
+     * @brief Division assignment operator of a quaternion and a scalar.
+     * It divides the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z] / s\\
+     * &=[w / s, x / s, y / s, z / s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * p /= s; // equivalent to p = p / s
+     * std::cout << p << std::endl; //[0.5, 1.0, 1.5, 2.0]
+     * ```
+     * @note the type of the scalar should be the same as the value type of the quaternion.
+     */
+    Quat<_Tp>& operator/=(const _Tp s);
+
+    /**
+     * @brief Division assignment operator of two quaternions p and q.
+     * It divides the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of quaternion division with a quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q &= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p /= q; // equivalent to p = p * q.inv()
+     * std::cout << p << std::endl;
+     * ```
+     */
     Quat<_Tp>& operator/=(const Quat<_Tp>&);
 
     _Tp& operator[](std::size_t n);
 
     const _Tp& operator[](std::size_t n) const;
 
-    template
-    friend Quat cv::operator*(const T, const Quat&);
+    /**
+     * @brief Subtraction operator of a scalar and a quaternion.
+     * Subtracts the right hand operand from the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << scalar - p << std::endl; //[1.0, -2, -3, -4]
+     * ```
+     * @note the type of the scalar should be the same as the value type of the quaternion.
+     */
+    template
+    friend Quat cv::operator-(const T s, const Quat&);
 
-    template
-    friend Quat cv::operator*(const Quat&, const T);
+    /**
+     * @brief Subtraction operator of a quaternion and a scalar.
+     * Subtracts the right hand operand from the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << p - scalar << std::endl; //[-1.0, 2, 3, 4]
+     * ```
+     * @note the type of the scalar should be the same as the value type of the quaternion.
+     */
+    template
+    friend Quat cv::operator-(const Quat&, const T s);
+
+    /**
+     * @brief Addition operator of a scalar and a quaternion.
+     * Adds the right hand operand to the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << scalar + p << std::endl; //[3.0, 2, 3, 4]
+     * ```
+     * @note the type of the scalar should be the same as the value type of the quaternion.
+     */
+    template
+    friend Quat cv::operator+(const T s, const Quat&);
+
+    /**
+     * @brief Addition operator of a quaternion and a scalar.
+     * Adds the right hand operand to the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << p + scalar << std::endl; //[3.0, 2, 3, 4]
+     * ```
+     * @note the type of the scalar should be the same as the value type of the quaternion.
+     */
+    template
+    friend Quat cv::operator+(const Quat&, const T s);
+
+    /**
+     * @brief Multiplication operator of a scalar and a quaternion.
+     * Multiplies values on either side of the operator.
+     *
+     * Rule of quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z] * s\\
+     * &=[w * s, x * s, y * s, z * s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * std::cout << s * p << std::endl; //[2.0, 4.0, 6.0, 8.0]
+     * ```
+     * @note the type of the scalar should be the same as the value type of the quaternion.
+     */
+    template
+    friend Quat cv::operator*(const T s, const Quat&);
+
+    /**
+     * @brief Multiplication operator of a quaternion and a scalar.
+     * Multiplies values on either side of the operator.
+     *
+     * Rule of quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z] * s\\
+     * &=[w * s, x * s, y * s, z * s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * std::cout << p * s << std::endl; //[2.0, 4.0, 6.0, 8.0]
+     * ```
+     * @note the type of the scalar should be the same as the value type of the quaternion.
+     */
+    template
+    friend Quat cv::operator*(const Quat&, const T s);
 
     template
     friend std::ostream& cv::operator<<(std::ostream&, const Quat&);
 
+    /**
+     * @brief Transform a quaternion q to Euler angles.
+     *
+     *
+     * When transforming a quaternion \f$q = w + x\boldsymbol{i} + y\boldsymbol{j} + z\boldsymbol{k}\f$ to Euler angles, the rotation matrix M can be calculated by:
+     * \f[ \begin{aligned} {M} &={\begin{bmatrix}1-2(y^{2}+z^{2})&2(xy-zw)&2(xz+yw)\\2(xy+zw)&1-2(x^{2}+z^{2})&2(yz-xw)\\2(xz-yw)&2(yz+xw)&1-2(x^{2}+y^{2})\end{bmatrix}}\end{aligned}.\f]
+     * On the other hand, the rotation matrix can be obtained from Euler angles.
+     * Using intrinsic rotations with Euler angles type XYZ as an example, where
+     * \f$\theta_1 \f$, \f$\theta_2 \f$, \f$\theta_3 \f$ are the three Euler angles, the rotation matrix R can be calculated by: \f[R = X(\theta_1)Y(\theta_2)Z(\theta_3)
+     * ={\begin{bmatrix}\cos\theta_{2}\cos\theta_{3}&-\cos\theta_{2}\sin\theta_{3}&\sin\theta_{2}\\\cos\theta_{1}\sin\theta_{3}+\cos\theta_{3}\sin\theta_{1}\sin\theta_{2}&\cos\theta_{1}\cos\theta_{3}-\sin\theta_{1}\sin\theta_{2}\sin\theta_{3}&-\cos\theta_{2}\sin\theta_{1}\\\sin\theta_{1}\sin\theta_{3}-\cos\theta_{1}\cos\theta_{3}\sin\theta_{2}&\cos\theta_{3}\sin\theta_{1}+\cos\theta_{1}\sin\theta_{2}\sin\theta_{3}&\cos\theta_{1}\cos\theta_{2}\end{bmatrix}}\f]
+     * Rotation matrices M and R are equal. As long as \f$s_{2} \neq \pm 1\f$, by comparing each element of the two matrices, the solution is \f$\begin{cases} \theta_1 = \arctan2(-m_{23},m_{33})\\\theta_2 = \arcsin(m_{13}) \\\theta_3 = \arctan2(-m_{12},m_{11}) \end{cases}\f$.
+     *
+     * When \f$s_{2}=1\f$ or \f$s_{2}=-1\f$, gimbal lock occurs. The function will prompt "WARNING: Gimbal Lock will occur. Euler angles are non-unique. For intrinsic rotations, we set the third angle to 0, and for extrinsic rotations, we set the first angle to 0.".
+     *
+     * When \f$s_{2}=1\f$,
+     * the rotation matrix R is \f$R = {\begin{bmatrix}0&0&1\\\sin(\theta_1+\theta_3)&\cos(\theta_1+\theta_3)&0\\-\cos(\theta_1+\theta_3)&\sin(\theta_1+\theta_3)&0\end{bmatrix}}\f$.
+     *
+     * The number of solutions is infinite with the condition \f$\begin{cases} \theta_1+\theta_3 = \arctan2(m_{21},m_{22})\\ \theta_2=\pi/2 \end{cases}\ \f$.
+     *
+     * We set \f$\theta_3 = 0\f$, and the solution is \f$\begin{cases} \theta_1=\arctan2(m_{21},m_{22})\\ \theta_2=\pi/2\\ \theta_3=0 \end{cases}\f$.
+     *
+     * When \f$s_{2}=-1\f$,
+     * the rotation matrix R is \f$X_{1}Y_{2}Z_{3}={\begin{bmatrix}0&0&-1\\-\sin(\theta_1-\theta_3)&\cos(\theta_1-\theta_3)&0\\\cos(\theta_1-\theta_3)&\sin(\theta_1-\theta_3)&0\end{bmatrix}}\f$.
+     *
+     * The number of solutions is infinite with the condition \f$\begin{cases} \theta_1-\theta_3 = \arctan2(m_{32},m_{22})\\ \theta_2=-\pi/2 \end{cases}\ \f$.
+     *
+     * We set \f$\theta_3 = 0\f$, and the solution is \f$ \begin{cases}\theta_1=\arctan2(m_{32},m_{22}) \\ \theta_2=-\pi/2\\ \theta_3=0\end{cases}\f$.
+     *
+     * Since \f$\sin \theta \in [-1,1] \f$ and \f$\cos \theta \in [-1,1] \f$, an unnormalized quaternion will cause computational troubles. For this reason, this function will normalize the quaternion at first and @ref QuatAssumeType is not needed.
+     *
+     * When the gimbal lock occurs, we set \f$\theta_3 = 0\f$ for intrinsic rotations or \f$\theta_1 = 0\f$ for extrinsic rotations.
+     *
+     * As a result, for every Euler angles type, we can get the solution as shown in the following table.
+ * EulerAnglesType | Ordinary | \f$\theta_2 = π/2\f$ | \f$\theta_2 = -π/2\f$ + * ------------- | -------------| -------------| ------------- + * INT_XYZ|\f$ \theta_1 = \arctan2(-m_{23},m_{33})\\\theta_2 = \arcsin(m_{13}) \\\theta_3= \arctan2(-m_{12},m_{11}) \f$|\f$ \theta_1=\arctan2(m_{21},m_{22})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(m_{32},m_{22})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$ + * INT_XZY|\f$ \theta_1 = \arctan2(m_{32},m_{22})\\\theta_2 = -\arcsin(m_{12}) \\\theta_3= \arctan2(m_{13},m_{11}) \f$|\f$ \theta_1=\arctan2(m_{31},m_{33})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(-m_{23},m_{33})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$ + * INT_YXZ|\f$ \theta_1 = \arctan2(m_{13},m_{33})\\\theta_2 = -\arcsin(m_{23}) \\\theta_3= \arctan2(m_{21},m_{22}) \f$|\f$ \theta_1=\arctan2(m_{12},m_{11})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(-m_{12},m_{11})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$ + * INT_YZX|\f$ \theta_1 = \arctan2(-m_{31},m_{11})\\\theta_2 = \arcsin(m_{21}) \\\theta_3= \arctan2(-m_{23},m_{22}) \f$|\f$ \theta_1=\arctan2(m_{13},m_{33})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(m_{13},m_{12})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$ + * INT_ZXY|\f$ \theta_1 = \arctan2(-m_{12},m_{22})\\\theta_2 = \arcsin(m_{32}) \\\theta_3= \arctan2(-m_{31},m_{33}) \f$|\f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$ + * INT_ZYX|\f$ \theta_1 = \arctan2(m_{21},m_{11})\\\theta_2 = \arcsin(-m_{31}) \\\theta_3= \arctan2(m_{32},m_{33}) \f$|\f$ \theta_1=\arctan2(m_{23},m_{22})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(-m_{12},m_{22})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$ + * EXT_XYZ|\f$ \theta_1 = \arctan2(m_{32},m_{33})\\\theta_2 = \arcsin(-m_{31}) \\\ \theta_3 = \arctan2(m_{21},m_{11})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{23},m_{22}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(-m_{12},m_{22}) \f$ + * EXT_XZY|\f$ \theta_1 = \arctan2(-m_{23},m_{22})\\\theta_2 = \arcsin(m_{21}) \\\theta_3= \arctan2(-m_{31},m_{11})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{13},m_{33}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(m_{13},m_{12}) \f$ + * EXT_YXZ|\f$ \theta_1 = \arctan2(-m_{31},m_{33}) \\\theta_2 = \arcsin(m_{32}) \\\theta_3= \arctan2(-m_{12},m_{22})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{21},m_{11}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(m_{21},m_{11}) \f$ + * EXT_YZX|\f$ \theta_1 = \arctan2(m_{13},m_{11})\\\theta_2 = -\arcsin(m_{12}) \\\theta_3= \arctan2(m_{32},m_{22})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{31},m_{33}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(-m_{23},m_{33}) \f$ + * EXT_ZXY|\f$ \theta_1 = \arctan2(m_{21},m_{22})\\\theta_2 = -\arcsin(m_{23}) \\\theta_3= \arctan2(m_{13},m_{33})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{12},m_{11}) \f$|\f$ \theta_1= 0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(-m_{12},m_{11}) \f$ + * EXT_ZYX|\f$ \theta_1 = \arctan2(-m_{12},m_{11})\\\theta_2 = \arcsin(m_{13}) \\\theta_3= \arctan2(-m_{23},m_{33})\f$|\f$ \theta_1=0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{21},m_{22}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(m_{32},m_{22}) \f$ + * + * EulerAnglesType | Ordinary | \f$\theta_2 = 0\f$ | \f$\theta_2 = π\f$ + * ------------- | -------------| -------------| ------------- + * INT_XYX| \f$ \theta_1 = \arctan2(m_{21},-m_{31})\\\theta_2 =\arccos(m_{11}) \\\theta_3 = 
\arctan2(m_{12},m_{13}) \f$| \f$ \theta_1=\arctan2(m_{32},m_{33})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{23},m_{22})\\ \theta_2=\pi\\ \theta_3=0 \f$ + * INT_XZX| \f$ \theta_1 = \arctan2(m_{31},m_{21})\\\theta_2 = \arccos(m_{11}) \\\theta_3 = \arctan2(m_{13},-m_{12}) \f$| \f$ \theta_1=\arctan2(m_{32},m_{33})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(-m_{32},m_{33})\\ \theta_2=\pi\\ \theta_3=0 \f$ + * INT_YXY| \f$ \theta_1 = \arctan2(m_{12},m_{32})\\\theta_2 = \arccos(m_{22}) \\\theta_3 = \arctan2(m_{21},-m_{23}) \f$| \f$ \theta_1=\arctan2(m_{13},m_{11})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(-m_{31},m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$ + * INT_YZY| \f$ \theta_1 = \arctan2(m_{32},-m_{12})\\\theta_2 = \arccos(m_{22}) \\\theta_3 =\arctan2(m_{23},m_{21}) \f$| \f$ \theta_1=\arctan2(m_{13},m_{11})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{13},-m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$ + * INT_ZXZ| \f$ \theta_1 = \arctan2(-m_{13},m_{23})\\\theta_2 = \arccos(m_{33}) \\\theta_3 =\arctan2(m_{31},m_{32}) \f$| \f$ \theta_1=\arctan2(m_{21},m_{22})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$ + * INT_ZYZ| \f$ \theta_1 = \arctan2(m_{23},m_{13})\\\theta_2 = \arccos(m_{33}) \\\theta_3 = \arctan2(m_{32},-m_{31}) \f$| \f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$ + * EXT_XYX| \f$ \theta_1 = \arctan2(m_{12},m_{13}) \\\theta_2 = \arccos(m_{11}) \\\theta_3 = \arctan2(m_{21},-m_{31})\f$| \f$ \theta_1=0\\ \theta_2=0\\ \theta_3=\arctan2(m_{32},m_{33}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3= \arctan2(m_{23},m_{22}) \f$ + * EXT_XZX| \f$ \theta_1 = \arctan2(m_{13},-m_{12})\\\theta_2 = \arccos(m_{11}) \\\theta_3 = \arctan2(m_{31},m_{21})\f$| \f$ \theta_1= 0\\ \theta_2=0\\ \theta_3=\arctan2(m_{32},m_{33}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(-m_{32},m_{33}) \f$ + * EXT_YXY| \f$ \theta_1 = \arctan2(m_{21},-m_{23})\\\theta_2 = \arccos(m_{22}) \\\theta_3 = \arctan2(m_{12},m_{32}) \f$| \f$ \theta_1= 0\\ \theta_2=0\\ \theta_3=\arctan2(m_{13},m_{11}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(-m_{31},m_{11}) \f$ + * EXT_YZY| \f$ \theta_1 = \arctan2(m_{23},m_{21}) \\\theta_2 = \arccos(m_{22}) \\\theta_3 = \arctan2(m_{32},-m_{12}) \f$| \f$ \theta_1= 0\\ \theta_2=0\\ \theta_3=\arctan2(m_{13},m_{11}) \f$| \f$ \theta_1=0\\ \theta_2=\pi\\ \theta_3=\arctan2(m_{13},-m_{11}) \f$ + * EXT_ZXZ| \f$ \theta_1 = \arctan2(m_{31},m_{32}) \\\theta_2 = \arccos(m_{33}) \\\theta_3 = \arctan2(-m_{13},m_{23})\f$| \f$ \theta_1=0\\ \theta_2=0\\ \theta_3=\arctan2(m_{21},m_{22}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(m_{21},m_{11}) \f$ + * EXT_ZYZ| \f$ \theta_1 = \arctan2(m_{32},-m_{31})\\\theta_2 = \arccos(m_{33}) \\\theta_3 = \arctan2(m_{23},m_{13}) \f$| \f$ \theta_1=0\\ \theta_2=0\\ \theta_3=\arctan2(m_{21},m_{11}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(m_{21},m_{11}) \f$ + * + * @param eulerAnglesType the convertion Euler angles type + */ + + Vec<_Tp, 3> toEulerAngles(QuatEnum::EulerAnglesType eulerAnglesType); + _Tp w, x, y, z; }; @@ -1165,8 +1667,8 @@ Quat exp(const Quat &q); template Quat log(const Quat &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT); -template -Quat power(const Quat& q, _T x, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT); +template +Quat power(const Quat& q, const T x, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT); template Quat crossProduct(const 
Quat &p, const Quat &q); @@ -1174,11 +1676,11 @@ Quat crossProduct(const Quat &p, const Quat &q); template Quat sqrt(const Quat &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT); -template -Quat operator*(const T, const Quat&); +template +Quat operator*(const T, const Quat&); -template -Quat operator*(const Quat&, const T); +template +Quat operator*(const Quat&, const T); template std::ostream& operator<<(std::ostream&, const Quat&); diff --git a/modules/core/include/opencv2/core/quaternion.inl.hpp b/modules/core/include/opencv2/core/quaternion.inl.hpp index 769f53ed4b..3c2fce10af 100644 --- a/modules/core/include/opencv2/core/quaternion.inl.hpp +++ b/modules/core/include/opencv2/core/quaternion.inl.hpp @@ -148,6 +148,30 @@ inline Quat Quat::operator+(const Quat &q1) const return Quat(w + q1.w, x + q1.x, y + q1.y, z + q1.z); } +template +inline Quat operator+(const T a, const Quat& q) +{ + return Quat(q.w + a, q.x, q.y, q.z); +} + +template +inline Quat operator+(const Quat& q, const T a) +{ + return Quat(q.w + a, q.x, q.y, q.z); +} + +template +inline Quat operator-(const T a, const Quat& q) +{ + return Quat(a - q.w, -q.x, -q.y, -q.z); +} + +template +inline Quat operator-(const Quat& q, const T a) +{ + return Quat(q.w - a, q.x, q.y, q.z); +} + template inline Quat Quat::operator-(const Quat &q1) const { @@ -183,14 +207,14 @@ inline Quat Quat::operator*(const Quat &q1) const } -template -Quat operator*(const Quat &q1, const S a) +template +Quat operator*(const Quat &q1, const T a) { return Quat(a * q1.w, a * q1.x, a * q1.y, a * q1.z); } -template -Quat operator*(const S a, const Quat &q1) +template +Quat operator*(const T a, const Quat &q1) { return Quat(a * q1.w, a * q1.x, a * q1.y, a * q1.z); } @@ -221,7 +245,7 @@ inline Quat& Quat::operator/=(const Quat &q1) return *this; } template -Quat& Quat::operator*=(const T &q1) +Quat& Quat::operator*=(const T q1) { w *= q1; x *= q1; @@ -231,7 +255,7 @@ Quat& Quat::operator*=(const T &q1) } template -inline Quat& Quat::operator/=(const T &a) +inline Quat& Quat::operator/=(const T a) { const T a_inv = 1.0 / a; w *= a_inv; @@ -242,7 +266,7 @@ inline Quat& Quat::operator/=(const T &a) } template -inline Quat Quat::operator/(const T &a) const +inline Quat Quat::operator/(const T a) const { const T a_inv = 1.0 / a; return Quat(w * a_inv, x * a_inv, y * a_inv, z * a_inv); @@ -353,15 +377,14 @@ Quat Quat::log(QuatAssumeType assumeUnit) const return Quat(std::log(qNorm), v[0] * k, v[1] * k, v[2] *k); } -template -inline Quat power(const Quat &q1, _T alpha, QuatAssumeType assumeUnit) +template +inline Quat power(const Quat &q1, const T alpha, QuatAssumeType assumeUnit) { return q1.power(alpha, assumeUnit); } template -template -inline Quat Quat::power(_T alpha, QuatAssumeType assumeUnit) const +inline Quat Quat::power(const T alpha, QuatAssumeType assumeUnit) const { if (x * x + y * y + z * z > CV_QUAT_EPS) { @@ -843,6 +866,197 @@ Quat Quat::spline(const Quat &q0, const Quat &q1, const Quat &q2, return squad(vec[1], s1, s2, vec[2], t, assumeUnit, QUAT_ASSUME_NOT_UNIT); } +namespace detail { + +template static +Quat createFromAxisRot(int axis, const T theta) +{ + if (axis == 0) + return Quat::createFromXRot(theta); + if (axis == 1) + return Quat::createFromYRot(theta); + if (axis == 2) + return Quat::createFromZRot(theta); + CV_Assert(0); +} + +inline bool isIntAngleType(QuatEnum::EulerAnglesType eulerAnglesType) +{ + return eulerAnglesType < QuatEnum::EXT_XYZ; +} + +inline bool isTaitBryan(QuatEnum::EulerAnglesType eulerAnglesType) +{ + return 
eulerAnglesType/6 == 1 || eulerAnglesType/6 == 3; +} +} // namespace detail + +template +Quat Quat::createFromYRot(const T theta) +{ + return Quat{std::cos(theta * 0.5f), 0, std::sin(theta * 0.5f), 0}; +} + +template +Quat Quat::createFromXRot(const T theta){ + return Quat{std::cos(theta * 0.5f), std::sin(theta * 0.5f), 0, 0}; +} + +template +Quat Quat::createFromZRot(const T theta){ + return Quat{std::cos(theta * 0.5f), 0, 0, std::sin(theta * 0.5f)}; +} + + +template +Quat Quat::createFromEulerAngles(const Vec &angles, QuatEnum::EulerAnglesType eulerAnglesType) { + CV_Assert(eulerAnglesType < QuatEnum::EulerAnglesType::EULER_ANGLES_MAX_VALUE); + static const int rotationAxis[24][3] = { + {0, 1, 2}, ///< Intrinsic rotations with the Euler angles type X-Y-Z + {0, 2, 1}, ///< Intrinsic rotations with the Euler angles type X-Z-Y + {1, 0, 2}, ///< Intrinsic rotations with the Euler angles type Y-X-Z + {1, 2, 0}, ///< Intrinsic rotations with the Euler angles type Y-Z-X + {2, 0, 1}, ///< Intrinsic rotations with the Euler angles type Z-X-Y + {2, 1, 0}, ///< Intrinsic rotations with the Euler angles type Z-Y-X + {0, 1, 0}, ///< Intrinsic rotations with the Euler angles type X-Y-X + {0, 2, 0}, ///< Intrinsic rotations with the Euler angles type X-Z-X + {1, 0, 1}, ///< Intrinsic rotations with the Euler angles type Y-X-Y + {1, 2, 1}, ///< Intrinsic rotations with the Euler angles type Y-Z-Y + {2, 0, 2}, ///< Intrinsic rotations with the Euler angles type Z-X-Z + {2, 1, 2}, ///< Intrinsic rotations with the Euler angles type Z-Y-Z + {0, 1, 2}, ///< Extrinsic rotations with the Euler angles type X-Y-Z + {0, 2, 1}, ///< Extrinsic rotations with the Euler angles type X-Z-Y + {1, 0, 2}, ///< Extrinsic rotations with the Euler angles type Y-X-Z + {1, 2, 0}, ///< Extrinsic rotations with the Euler angles type Y-Z-X + {2, 0, 1}, ///< Extrinsic rotations with the Euler angles type Z-X-Y + {2, 1, 0}, ///< Extrinsic rotations with the Euler angles type Z-Y-X + {0, 1, 0}, ///< Extrinsic rotations with the Euler angles type X-Y-X + {0, 2, 0}, ///< Extrinsic rotations with the Euler angles type X-Z-X + {1, 0, 1}, ///< Extrinsic rotations with the Euler angles type Y-X-Y + {1, 2, 1}, ///< Extrinsic rotations with the Euler angles type Y-Z-Y + {2, 0, 2}, ///< Extrinsic rotations with the Euler angles type Z-X-Z + {2, 1, 2} ///< Extrinsic rotations with the Euler angles type Z-Y-Z + }; + Quat q1 = detail::createFromAxisRot(rotationAxis[eulerAnglesType][0], angles(0)); + Quat q2 = detail::createFromAxisRot(rotationAxis[eulerAnglesType][1], angles(1)); + Quat q3 = detail::createFromAxisRot(rotationAxis[eulerAnglesType][2], angles(2)); + if (detail::isIntAngleType(eulerAnglesType)) + { + return q1 * q2 * q3; + } + else // (!detail::isIntAngleType(eulerAnglesType)) + { + return q3 * q2 * q1; + } +} + +template +Vec Quat::toEulerAngles(QuatEnum::EulerAnglesType eulerAnglesType){ + CV_Assert(eulerAnglesType < QuatEnum::EulerAnglesType::EULER_ANGLES_MAX_VALUE); + Matx33d R = toRotMat3x3(); + enum { + C_ZERO, + C_PI, + C_PI_2, + N_CONSTANTS, + R_0_0 = N_CONSTANTS, R_0_1, R_0_2, + R_1_0, R_1_1, R_1_2, + R_2_0, R_2_1, R_2_2 + }; + static const T constants_[N_CONSTANTS] = { + 0, // C_ZERO + (T)CV_PI, // C_PI + (T)(CV_PI * 0.5) // C_PI_2, -C_PI_2 + }; + static const int rotationR_[24][12] = { + {+R_0_2, +R_1_0, +R_1_1, C_PI_2, +R_2_1, +R_1_1, -C_PI_2, -R_1_2, +R_2_2, +R_0_2, -R_0_1, +R_0_0}, // INT_XYZ + {+R_0_1, -R_1_2, +R_2_2, -C_PI_2, +R_2_0, +R_2_2, C_PI_2, +R_2_1, +R_1_1, -R_0_1, +R_0_2, +R_0_0}, // INT_XZY + {+R_1_2, 
-R_0_1, +R_0_0, -C_PI_2, +R_0_1, +R_0_0, C_PI_2, +R_0_2, +R_2_2, -R_1_2, +R_1_0, +R_1_1}, // INT_YXZ + {+R_1_0, +R_0_2, +R_2_2, C_PI_2, +R_0_2, +R_0_1, -C_PI_2, -R_2_0, +R_0_0, +R_1_0, -R_1_2, +R_1_1}, // INT_YZX + {+R_2_1, +R_1_0, +R_0_0, C_PI_2, +R_1_0, +R_0_0, -C_PI_2, -R_0_1, +R_1_1, +R_2_1, -R_2_0, +R_2_2}, // INT_ZXY + {+R_2_0, -R_0_1, +R_1_1, -C_PI_2, +R_1_2, +R_1_1, C_PI_2, +R_1_0, +R_0_0, -R_2_0, +R_2_1, +R_2_2}, // INT_ZYX + {+R_0_0, +R_2_1, +R_2_2, C_ZERO, +R_1_2, +R_1_1, C_PI, +R_1_0, -R_2_0, +R_0_0, +R_0_1, +R_0_2}, // INT_XYX + {+R_0_0, +R_2_1, +R_2_2, C_ZERO, -R_2_1, +R_2_2, C_PI, +R_2_0, +R_1_0, +R_0_0, +R_0_2, -R_0_1}, // INT_XZX + {+R_1_1, +R_0_2, +R_0_0, C_ZERO, -R_2_0, +R_0_0, C_PI, +R_0_1, +R_2_1, +R_1_1, +R_1_0, -R_1_2}, // INT_YXY + {+R_1_1, +R_0_2, +R_0_0, C_ZERO, +R_0_2, -R_0_0, C_PI, +R_2_1, -R_0_1, +R_1_1, +R_1_2, +R_1_0}, // INT_YZY + {+R_2_2, +R_1_0, +R_1_1, C_ZERO, +R_1_0, +R_0_0, C_PI, +R_0_2, -R_1_2, +R_2_2, +R_2_0, +R_2_1}, // INT_ZXZ + {+R_2_2, +R_1_0, +R_0_0, C_ZERO, +R_1_0, +R_0_0, C_PI, +R_1_2, +R_0_2, +R_2_2, +R_2_1, -R_2_0}, // INT_ZYZ + + {+R_2_0, -C_PI_2, -R_0_1, +R_1_1, C_PI_2, +R_1_2, +R_1_1, +R_2_1, +R_2_2, -R_2_0, +R_1_0, +R_0_0}, // EXT_XYZ + {+R_1_0, C_PI_2, +R_0_2, +R_2_2, -C_PI_2, +R_0_2, +R_0_1, -R_1_2, +R_1_1, +R_1_0, -R_2_0, +R_0_0}, // EXT_XZY + {+R_2_1, C_PI_2, +R_1_0, +R_0_0, -C_PI_2, +R_1_0, +R_0_0, -R_2_0, +R_2_2, +R_2_1, -R_0_1, +R_1_1}, // EXT_YXZ + {+R_0_2, -C_PI_2, -R_1_2, +R_2_2, C_PI_2, +R_2_0, +R_2_2, +R_0_2, +R_0_0, -R_0_1, +R_2_1, +R_1_1}, // EXT_YZX + {+R_1_2, -C_PI_2, -R_0_1, +R_0_0, C_PI_2, +R_0_1, +R_0_0, +R_1_0, +R_1_1, -R_1_2, +R_0_2, +R_2_2}, // EXT_ZXY + {+R_0_2, C_PI_2, +R_1_0, +R_1_1, -C_PI_2, +R_2_1, +R_1_1, -R_0_1, +R_0_0, +R_0_2, -R_1_2, +R_2_2}, // EXT_ZYX + {+R_0_0, C_ZERO, +R_2_1, +R_2_2, C_PI, +R_1_2, +R_1_1, +R_0_1, +R_0_2, +R_0_0, +R_1_0, -R_2_0}, // EXT_XYX + {+R_0_0, C_ZERO, +R_2_1, +R_2_2, C_PI, +R_2_1, +R_2_2, +R_0_2, -R_0_1, +R_0_0, +R_2_0, +R_1_0}, // EXT_XZX + {+R_1_1, C_ZERO, +R_0_2, +R_0_0, C_PI, -R_2_0, +R_0_0, +R_1_0, -R_1_2, +R_1_1, +R_0_1, +R_2_1}, // EXT_YXY + {+R_1_1, C_ZERO, +R_0_2, +R_0_0, C_PI, +R_0_2, -R_0_0, +R_1_2, +R_1_0, +R_1_1, +R_2_1, -R_0_1}, // EXT_YZY + {+R_2_2, C_ZERO, +R_1_0, +R_1_1, C_PI, +R_1_0, +R_0_0, +R_2_0, +R_2_1, +R_2_2, +R_0_2, -R_1_2}, // EXT_ZXZ + {+R_2_2, C_ZERO, +R_1_0, +R_0_0, C_PI, +R_1_0, +R_0_0, +R_2_1, -R_2_0, +R_2_2, +R_1_2, +R_0_2}, // EXT_ZYZ + }; + T rotationR[12]; + for (int i = 0; i < 12; i++) + { + int id = rotationR_[eulerAnglesType][i]; + unsigned idx = std::abs(id); + T value = 0.0f; + if (idx < N_CONSTANTS) + { + value = constants_[idx]; + } + else + { + unsigned r_idx = idx - N_CONSTANTS; + CV_DbgAssert(r_idx < 9); + value = R.val[r_idx]; + } + bool isNegative = id < 0; + if (isNegative) + value = -value; + rotationR[i] = value; + } + Vec angles; + if (detail::isIntAngleType(eulerAnglesType)) + { + if (abs(rotationR[0] - 1) < CV_QUAT_CONVERT_THRESHOLD) + { + CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the third angle to 0"); + angles = {std::atan2(rotationR[1], rotationR[2]), rotationR[3], 0}; + return angles; + } + else if(abs(rotationR[0] + 1) < CV_QUAT_CONVERT_THRESHOLD) + { + CV_LOG_WARNING(NULL,"Gimbal Lock occurs. 
Euler angles are non-unique, we set the third angle to 0"); + angles = {std::atan2(rotationR[4], rotationR[5]), rotationR[6], 0}; + return angles; + } + } + else // (!detail::isIntAngleType(eulerAnglesType)) + { + if (abs(rotationR[0] - 1) < CV_QUAT_CONVERT_THRESHOLD) + { + CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the first angle to 0"); + angles = {0, rotationR[1], std::atan2(rotationR[2], rotationR[3])}; + return angles; + } + else if (abs(rotationR[0] + 1) < CV_QUAT_CONVERT_THRESHOLD) + { + CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the first angle to 0"); + angles = {0, rotationR[4], std::atan2(rotationR[5], rotationR[6])}; + return angles; + } + } + + angles(0) = std::atan2(rotationR[7], rotationR[8]); + if (detail::isTaitBryan(eulerAnglesType)) + angles(1) = std::acos(rotationR[9]); + else + angles(1) = std::asin(rotationR[9]); + angles(2) = std::atan2(rotationR[10], rotationR[11]); + return angles; +} + } // namespace //! @endcond diff --git a/modules/core/include/opencv2/core/simd_intrinsics.hpp b/modules/core/include/opencv2/core/simd_intrinsics.hpp index c50923f0ef..8fe7ee6b60 100644 --- a/modules/core/include/opencv2/core/simd_intrinsics.hpp +++ b/modules/core/include/opencv2/core/simd_intrinsics.hpp @@ -40,7 +40,6 @@ Notes: #endif #include "opencv2/core/cvdef.h" -#include "opencv2/core/version.hpp" #ifdef OPENCV_SIMD_CONFIG_HEADER #include CVAUX_STR(OPENCV_SIMD_CONFIG_HEADER) diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index c52abbbff4..f0368027aa 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -570,6 +570,8 @@ static inline size_t getElemSize(int type) { return (size_t)CV_ELEM_SIZE(type); /////////////////////////////// Parallel Primitives ////////////////////////////////// /** @brief Base class for parallel data processors + +@ingroup core_parallel */ class CV_EXPORTS ParallelLoopBody { @@ -579,17 +581,23 @@ public: }; /** @brief Parallel data processor + +@ingroup core_parallel */ CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body, double nstripes=-1.); +//! @ingroup core_parallel class ParallelLoopBodyLambdaWrapper : public ParallelLoopBody { private: std::function<void(const Range&)> m_functor; public: - ParallelLoopBodyLambdaWrapper(std::function<void(const Range&)> functor) : - m_functor(functor) - { } + inline + ParallelLoopBodyLambdaWrapper(std::function<void(const Range&)> functor) + : m_functor(functor) + { + // nothing + } virtual void operator() (const cv::Range& range) const CV_OVERRIDE { @@ -597,11 +605,14 @@ public: } }; -inline void parallel_for_(const Range& range, std::function<void(const Range&)> functor, double nstripes=-1.) +//! @ingroup core_parallel +static inline +void parallel_for_(const Range& range, std::function<void(const Range&)> functor, double nstripes=-1.) 
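// [Editor's sketch -- not part of the patch.] The lambda overload declared
// just above saves callers from subclassing ParallelLoopBody. Minimal usage,
// assuming an existing CV_32F matrix `m`:
//
//   cv::parallel_for_(cv::Range(0, m.rows), [&](const cv::Range& range)
//   {
//       for (int r = range.start; r < range.end; r++)
//       {
//           float* row = m.ptr<float>(r);
//           for (int c = 0; c < m.cols; c++)
//               row[c] = std::sqrt(row[c]); // any independent per-row work
//       }
//   });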
{ parallel_for_(range, ParallelLoopBodyLambdaWrapper(functor), nstripes); } + /////////////////////////////// forEach method of cv::Mat //////////////////////////// template<typename _Tp, typename Functor> inline void Mat::forEach_impl(const Functor& operation) { diff --git a/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp b/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp index 61fcf15977..eb5ecde16b 100644 --- a/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp +++ b/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp @@ -7,13 +7,11 @@ #include "./allocator_stats.hpp" -#ifdef CV_CXX11 -#include <atomic> -#endif - //#define OPENCV_DISABLE_ALLOCATOR_STATS -namespace cv { namespace utils { +#ifdef CV_CXX11 + +#include <atomic> #ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE #if defined(__GNUC__) && (\ @@ -28,6 +26,16 @@ namespace cv { namespace utils { #define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long #endif +#else // CV_CXX11 + +#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE +#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int // CV_XADD supports int only +#endif + +#endif // CV_CXX11 + +namespace cv { namespace utils { + #ifdef CV__ALLOCATOR_STATS_LOG namespace { #endif diff --git a/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp b/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp new file mode 100644 index 0000000000..bc3ae4d08a --- /dev/null +++ b/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp @@ -0,0 +1,163 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_UTILS_PLUGIN_LOADER_HPP +#define OPENCV_UTILS_PLUGIN_LOADER_HPP + +#include "opencv2/core/utils/filesystem.hpp" +#include "opencv2/core/utils/filesystem.private.hpp" + +#if OPENCV_HAVE_FILESYSTEM_SUPPORT + +#if defined(_WIN32) +#include <windows.h> +#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) +#include <dlfcn.h> +#endif + +namespace cv { namespace plugin { namespace impl { + +#if defined(_WIN32) +typedef HMODULE LibHandle_t; +typedef wchar_t FileSystemChar_t; +typedef std::wstring FileSystemPath_t; + +// TODO wchar_t <=> UTF-8 +static inline +FileSystemPath_t toFileSystemPath(const std::string& p) +{ + FileSystemPath_t result; + result.resize(p.size()); + for (size_t i = 0; i < p.size(); i++) + result[i] = (wchar_t)p[i]; + return result; +} + +// TODO wchar_t <=> UTF-8 +static inline +std::string toPrintablePath(const FileSystemPath_t& p) +{ + std::string result; + result.resize(p.size()); + for (size_t i = 0; i < p.size(); i++) + { + wchar_t ch = p[i]; + if ((int)ch >= ' ' && (int)ch < 128) + result[i] = (char)ch; + else + result[i] = '?'; + } + return result; +} +#else // !_WIN32 +typedef void* LibHandle_t; +typedef char FileSystemChar_t; +typedef std::string FileSystemPath_t; + +static inline FileSystemPath_t toFileSystemPath(const std::string& p) { return p; } +static inline std::string toPrintablePath(const FileSystemPath_t& p) { return p; } +#endif + + +static inline +void* getSymbol_(LibHandle_t h, const char* symbolName) +{ +#if defined(_WIN32) + return (void*)GetProcAddress(h, symbolName); +#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) + return dlsym(h, symbolName); +#endif +} + +static inline +LibHandle_t libraryLoad_(const 
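// [Editor's sketch -- not part of the patch.] getSymbol_/libraryLoad_/
// libraryRelease_ wrap the platform loaders (LoadLibraryW/GetProcAddress on
// Windows, dlopen/dlsym elsewhere). Hypothetical usage -- the file name and
// symbol name below are made up for illustration:
//
//   FileSystemPath_t path = toFileSystemPath("libexample_plugin.so");
//   LibHandle_t h = libraryLoad_(path);
//   if (h != NULL)
//   {
//       void* initFn = getSymbol_(h, "example_plugin_init"); // hypothetical symbol
//       // ... call through initFn if it is non-NULL, then release the module
//       libraryRelease_(h);
//   }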
FileSystemPath_t& filename) +{ +#if defined(_WIN32) +# ifdef WINRT + return LoadPackagedLibrary(filename.c_str(), 0); +# else + return LoadLibraryW(filename.c_str()); +#endif +#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) + return dlopen(filename.c_str(), RTLD_NOW); +#endif +} + +static inline +void libraryRelease_(LibHandle_t h) +{ +#if defined(_WIN32) + FreeLibrary(h); +#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) + dlclose(h); +#endif +} + +static inline +std::string libraryPrefix() +{ +#if defined(_WIN32) + return ""; +#else + return "lib"; +#endif +} +static inline +std::string librarySuffix() +{ +#if defined(_WIN32) + const char* suffix = "" + CVAUX_STR(CV_MAJOR_VERSION) CVAUX_STR(CV_MINOR_VERSION) CVAUX_STR(CV_SUBMINOR_VERSION) + #if (defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__) + "_64" + #endif + #if defined(_DEBUG) && defined(DEBUG_POSTFIX) + CVAUX_STR(DEBUG_POSTFIX) + #endif + ".dll"; + return suffix; +#else + return ".so"; +#endif +} + + +//============================ + +class CV_EXPORTS DynamicLib +{ +private: + LibHandle_t handle; + const FileSystemPath_t fname; + bool disableAutoUnloading_; + +public: + DynamicLib(const FileSystemPath_t& filename); + ~DynamicLib(); + /** Do not automatically unload library in destructor */ + inline void disableAutomaticLibraryUnloading() + { + disableAutoUnloading_ = true; + } + inline bool isLoaded() const + { + return handle != NULL; + } + void* getSymbol(const char* symbolName) const; + const std::string getName() const; +private: + void libraryLoad(const FileSystemPath_t& filename); + void libraryRelease(); + +private: + DynamicLib(const DynamicLib &) = delete; + DynamicLib &operator=(const DynamicLib &) = delete; +}; + + +}}} // namespace + +#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT + +#endif // OPENCV_UTILS_PLUGIN_LOADER_HPP diff --git a/modules/core/include/opencv2/core/utils/tls.hpp b/modules/core/include/opencv2/core/utils/tls.hpp index 697a7b0340..124caebc85 100644 --- a/modules/core/include/opencv2/core/utils/tls.hpp +++ b/modules/core/include/opencv2/core/utils/tls.hpp @@ -5,7 +5,9 @@ #ifndef OPENCV_UTILS_TLS_HPP #define OPENCV_UTILS_TLS_HPP -#include <opencv2/core/utility.hpp> +#ifndef OPENCV_CORE_UTILITY_H +#error "tls.hpp must be included after opencv2/core/utility.hpp or opencv2/core.hpp" +#endif namespace cv { diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp index 08ae890175..68863ffb36 100644 --- a/modules/core/include/opencv2/core/vsx_utils.hpp +++ b/modules/core/include/opencv2/core/vsx_utils.hpp @@ -497,13 +497,15 @@ VSX_IMPL_CONV_EVEN_2_4(vec_uint4, vec_double2, vec_ctu, vec_ctuo) VSX_FINLINE(rt) fnm(const rg& a, int only_truncate) \ { \ assert(only_truncate == 0); \ - CV_UNUSED(only_truncate); \ + CV_UNUSED(only_truncate); \ return fn2(a); \ } VSX_IMPL_CONV_2VARIANT(vec_int4, vec_float4, vec_cts, vec_cts) + VSX_IMPL_CONV_2VARIANT(vec_uint4, vec_float4, vec_ctu, vec_ctu) VSX_IMPL_CONV_2VARIANT(vec_float4, vec_int4, vec_ctf, vec_ctf) + VSX_IMPL_CONV_2VARIANT(vec_float4, vec_uint4, vec_ctf, vec_ctf) // define vec_cts for converting double precision to signed doubleword - // which isn't combitable with xlc but its okay since Eigen only use it for gcc + // which isn't compatible with xlc but it's okay since Eigen only uses it for gcc VSX_IMPL_CONV_2VARIANT(vec_dword2, 
vec_double2, vec_cts, vec_ctsl) #endif // Eigen diff --git a/modules/core/misc/java/test/MatTest.java b/modules/core/misc/java/test/MatTest.java index 039aa39929..00e7b7cb32 100644 --- a/modules/core/misc/java/test/MatTest.java +++ b/modules/core/misc/java/test/MatTest.java @@ -455,6 +455,27 @@ public class MatTest extends OpenCVTestCase { bytesNum = sm.get(1, 1, buff11); assertEquals(4, bytesNum); assertTrue(Arrays.equals(new short[] {340, 341, 0, 0}, buff11)); + + Mat m2 = new Mat(new int[]{ 5, 6, 8 }, CvType.CV_16S); + short[] data = new short[(int)m2.total()]; + for (int i = 0; i < data.length; i++ ) { + data[i] = (short)i; + } + m2.put(new int[] {0, 0, 0}, data); + Mat matNonContinuous = m2.submat(new Range[]{new Range(1,4), new Range(2,5), new Range(3,6)}); + Mat matContinuous = matNonContinuous.clone(); + short[] outNonContinuous = new short[(int)matNonContinuous.total()]; + matNonContinuous.get(new int[] { 0, 0, 0 }, outNonContinuous); + short[] outContinuous = new short[(int)matNonContinuous.total()]; + matContinuous.get(new int[] { 0, 0, 0 }, outContinuous); + assertArrayEquals(outNonContinuous, outContinuous); + Mat subMat2 = m2.submat(new Range[]{new Range(1,4), new Range(1,5), new Range(0,8)}); + Mat subMatClone2 = subMat2.clone(); + short[] outNonContinuous2 = new short[(int)subMat2.total()]; + subMat2.get(new int[] { 0, 1, 1 }, outNonContinuous2); + short[] outContinuous2 = new short[(int)subMat2.total()]; + subMatClone2.get(new int[] { 0, 1, 1 }, outContinuous2); + assertArrayEquals(outNonContinuous2, outContinuous2); } public void testGetNativeObjAddr() { diff --git a/modules/core/misc/plugins/parallel_openmp/CMakeLists.txt b/modules/core/misc/plugins/parallel_openmp/CMakeLists.txt new file mode 100644 index 0000000000..024d2046cf --- /dev/null +++ b/modules/core/misc/plugins/parallel_openmp/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.5) +project(opencv_core_parallel_openmp CXX) + +get_filename_component(OpenCV_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../../.." ABSOLUTE) +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVPluginStandalone.cmake") + +# scan dependencies +set(WITH_OPENMP ON) +include("${OpenCV_SOURCE_DIR}/modules/core/cmake/parallel/init.cmake") + +message(STATUS "OpenMP: ${OpenMP_CXX_VERSION}") +ocv_create_plugin(core "opencv_core_parallel_openmp" "ocv.3rdparty.openmp" "OPENMP" "src/parallel/parallel_openmp.cpp") diff --git a/modules/core/misc/plugins/parallel_tbb/CMakeLists.txt b/modules/core/misc/plugins/parallel_tbb/CMakeLists.txt new file mode 100644 index 0000000000..c2129c7c2c --- /dev/null +++ b/modules/core/misc/plugins/parallel_tbb/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.5) +project(opencv_core_parallel_tbb CXX) + +get_filename_component(OpenCV_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../../.." 
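# [Editor's note -- not part of the patch.] Both plugin CMakeLists follow one
# standalone pattern: resolve the OpenCV source tree, force the backend on
# (WITH_OPENMP / WITH_TBB), include the module's parallel/init.cmake to scan
# dependencies, then emit the plugin via ocv_create_plugin(). A build sketch,
# assuming an OpenCV checkout and a CMake new enough for -S/-B (>= 3.13):
#
#   cmake -S opencv/modules/core/misc/plugins/parallel_tbb -B build_plugin
#   cmake --build build_plugin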
ABSOLUTE) +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVPluginStandalone.cmake") + +# scan dependencies +set(WITH_TBB ON) +include("${OpenCV_SOURCE_DIR}/modules/core/cmake/parallel/init.cmake") + +message(STATUS "TBB: ver ${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR} interface ${TBB_INTERFACE_VERSION}") +ocv_create_plugin(core "opencv_core_parallel_tbb" "ocv.3rdparty.tbb" "TBB" "src/parallel/parallel_tbb.cpp") diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp index 9f5f6e9e77..0cbfc2d653 100644 --- a/modules/core/perf/opencl/perf_arithm.cpp +++ b/modules/core/perf/opencl/perf_arithm.cpp @@ -678,7 +678,12 @@ OCL_PERF_TEST_P(SqrtFixture, Sqrt, ::testing::Combine( OCL_TEST_CYCLE() cv::sqrt(src, dst); - if (CV_MAT_DEPTH(type) >= CV_32F) + // To square root 32 bit floats we use native_sqrt, which has implementation + // defined accuracy. We know intel devices have accurate native_sqrt, but + // otherwise stick to a relaxed sanity check. For types larger than 32 bits + // we can do the accuracy check for all devices as normal. + if (CV_MAT_DEPTH(type) > CV_32F || !ocl::useOpenCL() || + ocl::Device::getDefault().isIntel()) SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE); else SANITY_CHECK(dst, 1); diff --git a/modules/core/perf/perf_allocation.cpp b/modules/core/perf/perf_allocation.cpp new file mode 100755 index 0000000000..2f3bf3eaa7 --- /dev/null +++ b/modules/core/perf/perf_allocation.cpp @@ -0,0 +1,48 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" +#include <array> + +using namespace perf; + +#define ALLOC_MAT_SIZES ::perf::szSmall24, ::perf::szSmall32, ::perf::szSmall64, \ + ::perf::sz5MP, ::perf::sz2K, ::perf::szSmall128, ::perf::szODD, ::perf::szQVGA, \ + ::perf::szVGA, ::perf::szSVGA, ::perf::sz720p, ::perf::sz1080p, ::perf::sz2160p, \ + ::perf::sz4320p, ::perf::sz3MP, ::perf::szXGA, ::perf::szSXGA, ::perf::szWQHD, \ + ::perf::sznHD, ::perf::szqHD + +namespace opencv_test +{ + +typedef perf::TestBaseWithParam<MatDepth> MatDepth_tb; + +PERF_TEST_P(MatDepth_tb, DISABLED_Allocation_Aligned, + testing::Values(CV_8UC1, CV_16SC1, CV_8UC3, CV_8UC4)) +{ + const int matType = GetParam(); + const cv::Mat utility(1, 1, matType); + const size_t elementBytes = utility.elemSize(); + + const std::array<cv::Size, 20> sizes{ALLOC_MAT_SIZES}; + std::array<size_t, 20> bytes; + for (size_t i = 0; i < sizes.size(); ++i) + { + bytes[i] = sizes[i].width * sizes[i].height * elementBytes; + } + + declare.time(60) + .iterations(100); + + TEST_CYCLE() + { + for (int i = 0; i < 100000; ++i) + { + fastFree(fastMalloc(bytes[i % sizes.size()])); + } + } + SANITY_CHECK_NOTHING(); +} + +}; diff --git a/modules/core/src/alloc.cpp b/modules/core/src/alloc.cpp index 98012998fc..a0def9db2e 100644 --- a/modules/core/src/alloc.cpp +++ b/modules/core/src/alloc.cpp @@ -82,7 +82,7 @@ cv::utils::AllocatorStatisticsInterface& getAllocatorStatistics() return allocator_stats; } -#if defined HAVE_POSIX_MEMALIGN || defined HAVE_MEMALIGN +#if defined HAVE_POSIX_MEMALIGN || defined HAVE_MEMALIGN || defined HAVE_WIN32_ALIGNED_MALLOC static bool readMemoryAlignmentParameter() { bool value = true; @@ -100,25 +100,27 @@ static bool readMemoryAlignmentParameter() // TODO add checks for valgrind, ASAN if value == false return value; } + +#if defined _MSC_VER +#pragma warning(suppress:4714) // preventive: const marked as __forceinline not inlined +static __forceinline +#else 
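// [Editor's sketch -- not part of the patch.] isAlignedAllocationEnabled()
// below uses the construct-on-first-use idiom: a function-local static is
// initialized exactly once on first call (thread-safe since C++11), which
// sidesteps the static initialization order problem of namespace-scope
// flags. The general shape:
//
//   bool isFeatureEnabled()
//   {
//       static bool value = readConfiguration(); // runs once, on first use
//       return value;
//   }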
static inline +#endif bool isAlignedAllocationEnabled() { - static bool initialized = false; - static bool useMemalign = true; - if (!initialized) - { - initialized = true; // trick to avoid stuck in acquire (works only if allocations are scope based) - useMemalign = readMemoryAlignmentParameter(); - } + // use construct on first use idiom https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use + // details: https://github.com/opencv/opencv/issues/15691 + static bool useMemalign = readMemoryAlignmentParameter(); return useMemalign; } -// do not use variable directly, details: https://github.com/opencv/opencv/issues/15691 + +// need for this static const is disputed; retaining as it doesn't cause harm static const bool g_force_initialization_memalign_flag #if defined __GNUC__ __attribute__((unused)) #endif = isAlignedAllocationEnabled(); - #endif #ifdef OPENCV_ALLOC_ENABLE_STATISTICS @@ -146,6 +148,14 @@ void* fastMalloc(size_t size) return OutOfMemoryError(size); return ptr; } +#elif defined HAVE_WIN32_ALIGNED_MALLOC + if (isAlignedAllocationEnabled()) + { + void* ptr = _aligned_malloc(size, CV_MALLOC_ALIGN); + if(!ptr) + return OutOfMemoryError(size); + return ptr; + } #endif uchar* udata = (uchar*)malloc(size + sizeof(void*) + CV_MALLOC_ALIGN); if(!udata) @@ -168,6 +178,12 @@ void fastFree(void* ptr) free(ptr); return; } +#elif defined HAVE_WIN32_ALIGNED_MALLOC + if (isAlignedAllocationEnabled()) + { + _aligned_free(ptr); + return; + } #endif if(ptr) { diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 1c95985e9a..27acaf1bbf 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -57,26 +57,6 @@ namespace cv * logical operations * \****************************************************************************************/ -void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ) -{ - int scn = (int)sc.total(), cn = CV_MAT_CN(buftype); - size_t esz = CV_ELEM_SIZE(buftype); - BinaryFunc cvtFn = getConvertFunc(sc.depth(), buftype); - CV_Assert(cvtFn); - cvtFn(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0); - // unroll the scalar - if( scn < cn ) - { - CV_Assert( scn == 1 ); - size_t esz1 = CV_ELEM_SIZE1(buftype); - for( size_t i = esz1; i < esz; i++ ) - scbuf[i] = scbuf[i - esz1]; - } - for( size_t i = esz; i < blocksize*esz; i++ ) - scbuf[i] = scbuf[i - esz]; -} - - enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4, OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8, OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14, @@ -647,7 +627,8 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) || (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) ) { - if( checkScalar(*psrc1, type2, kind1, kind2) ) + if ((type1 == CV_64F && (sz1.height == 1 || sz1.height == 4)) && + checkScalar(*psrc1, type2, kind1, kind2)) { // src1 is a scalar; swap it with src2 swap(psrc1, psrc2); @@ -1002,9 +983,7 @@ static BinaryFuncC* getRecipTab() return recipTab; } -} - -void cv::multiply(InputArray src1, InputArray src2, +void multiply(InputArray src1, InputArray src2, OutputArray dst, double scale, int dtype) { CV_INSTRUMENT_REGION(); @@ -1013,7 +992,7 @@ void cv::multiply(InputArray src1, InputArray src2, true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? 
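// [Editor's sketch -- not part of the patch.] With a non-unit scale the
// OpenCL path picks the MUL_SCALE op; either way the per-element result is
// dst(I) = saturate_cast<dtype>(scale * src1(I) * src2(I)). For example:
//
//   cv::Mat a = (cv::Mat_<float>(1, 2) << 2.f, 3.f);
//   cv::Mat b = (cv::Mat_<float>(1, 2) << 4.f, 5.f);
//   cv::Mat c;
//   cv::multiply(a, b, c, 0.5); // c = {4.0f, 7.5f}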
OCL_OP_MUL : OCL_OP_MUL_SCALE); } -void cv::divide(InputArray src1, InputArray src2, +void divide(InputArray src1, InputArray src2, OutputArray dst, double scale, int dtype) { CV_INSTRUMENT_REGION(); @@ -1021,7 +1000,7 @@ void cv::divide(InputArray src1, InputArray src2, arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE); } -void cv::divide(double scale, InputArray src2, +void divide(double scale, InputArray src2, OutputArray dst, int dtype) { CV_INSTRUMENT_REGION(); @@ -1029,13 +1008,17 @@ void cv::divide(double scale, InputArray src2, arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE); } +UMat UMat::mul(InputArray m, double scale) const +{ + UMat dst; + multiply(*this, m, dst, scale); + return dst; +} + /****************************************************************************************\ * addWeighted * \****************************************************************************************/ -namespace cv -{ - static BinaryFuncC* getAddWeightedTab() { static BinaryFuncC addWeightedTab[] = @@ -1849,6 +1832,9 @@ void cv::inRange(InputArray _src, InputArray _lowerb, } } + +#ifndef OPENCV_EXCLUDE_C_API + /****************************************************************************************\ * Earlier API: cvAdd etc. * \****************************************************************************************/ @@ -2008,4 +1994,5 @@ cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op ) cv::compare( src1, value, dst, cmp_op ); } +#endif // OPENCV_EXCLUDE_C_API /* End of file. */ diff --git a/modules/core/src/array.cpp b/modules/core/src/array.cpp index b2f20e41c9..a9ddefef4c 100644 --- a/modules/core/src/array.cpp +++ b/modules/core/src/array.cpp @@ -48,6 +48,8 @@ #include "precomp.hpp" +#ifndef OPENCV_EXCLUDE_C_API + #define CV_ORIGIN_TL 0 #define CV_ORIGIN_BL 1 @@ -2748,53 +2750,6 @@ void DefaultDeleter<CvMatND>::operator ()(CvMatND* obj) const { cvReleaseMatND(&obj); } void DefaultDeleter<CvSparseMat>::operator ()(CvSparseMat* obj) const { cvReleaseSparseMat(&obj); } void DefaultDeleter<CvMemStorage>::operator ()(CvMemStorage* obj) const { cvReleaseMemStorage(&obj); } -template<typename T> static inline -void scalarToRawData_(const Scalar& s, T * const buf, const int cn, const int unroll_to) -{ - int i = 0; - for(; i < cn; i++) - buf[i] = saturate_cast<T>(s.val[i]); - for(; i < unroll_to; i++) - buf[i] = buf[i-cn]; -} - -void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to) -{ - CV_INSTRUMENT_REGION(); - - const int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - CV_Assert(cn <= 4); - switch(depth) - { - case CV_8U: - scalarToRawData_<uchar>(s, (uchar*)_buf, cn, unroll_to); - break; - case CV_8S: - scalarToRawData_<schar>(s, (schar*)_buf, cn, unroll_to); - break; - case CV_16U: - scalarToRawData_<ushort>(s, (ushort*)_buf, cn, unroll_to); - break; - case CV_16S: - scalarToRawData_<short>(s, (short*)_buf, cn, unroll_to); - break; - case CV_32S: - scalarToRawData_<int>(s, (int*)_buf, cn, unroll_to); - break; - case CV_32F: - scalarToRawData_<float>(s, (float*)_buf, cn, unroll_to); - break; - case CV_64F: - scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to); - break; - case CV_16F: - scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to); - break; - default: - CV_Error(CV_StsUnsupportedFormat,""); - } -} - } // cv:: @@ -2817,4 +2772,5 @@ cvRelease( void** struct_ptr ) } +#endif // OPENCV_EXCLUDE_C_API /* End of file. 
*/ diff --git a/modules/core/src/bindings_utils.cpp b/modules/core/src/bindings_utils.cpp index 050b7247f8..78716c21f6 100644 --- a/modules/core/src/bindings_utils.cpp +++ b/modules/core/src/bindings_utils.cpp @@ -5,6 +5,8 @@ #include "precomp.hpp" #include "opencv2/core/bindings_utils.hpp" #include +#include +#include namespace cv { namespace utils { @@ -208,4 +210,15 @@ CV_EXPORTS_W String dumpInputOutputArrayOfArrays(InputOutputArrayOfArrays argume return ss.str(); } +namespace fs { +cv::String getCacheDirectoryForDownloads() +{ +#if OPENCV_HAVE_FILESYSTEM_SUPPORT + return cv::utils::fs::getCacheDirectory("downloads", "OPENCV_DOWNLOADS_CACHE_DIR"); +#else + CV_Error(Error::StsNotImplemented, "File system support is disabled in this OpenCV build!"); +#endif +} +} // namespace fs + }} // namespace diff --git a/modules/core/src/convert.dispatch.cpp b/modules/core/src/convert.dispatch.cpp index bc8340b687..345b4624cb 100644 --- a/modules/core/src/convert.dispatch.cpp +++ b/modules/core/src/convert.dispatch.cpp @@ -154,7 +154,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int sdepth == CV_32F ? "half" : "float", rowsPerWI, sdepth == CV_32F ? " -D FLOAT_TO_HALF " : ""); - ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt); + ocl::Kernel k(sdepth == CV_32F ? "convertFp16_FP32_to_FP16" : "convertFp16_FP16_to_FP32", ocl::core::halfconvert_oclsrc, build_opt); if (k.empty()) return false; diff --git a/modules/core/src/convert_c.cpp b/modules/core/src/convert_c.cpp index efe4de740a..96beffccc6 100644 --- a/modules/core/src/convert_c.cpp +++ b/modules/core/src/convert_c.cpp @@ -5,6 +5,7 @@ #include "precomp.hpp" +#ifndef OPENCV_EXCLUDE_C_API CV_IMPL void cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 ) @@ -132,3 +133,5 @@ CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr, CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() ); cv::normalize( src, dst, a, b, norm_type, dst.type(), mask ); } + +#endif // OPENCV_EXCLUDE_C_API diff --git a/modules/core/src/convert_scale.dispatch.cpp b/modules/core/src/convert_scale.dispatch.cpp index 83376aa61d..6902ecc24b 100644 --- a/modules/core/src/convert_scale.dispatch.cpp +++ b/modules/core/src/convert_scale.dispatch.cpp @@ -9,7 +9,6 @@ #include "convert_scale.simd.hpp" #include "convert_scale.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content - namespace cv { @@ -117,143 +116,4 @@ void convertScaleAbs(InputArray _src, OutputArray _dst, double alpha, double bet } } -//================================================================================================== - -#ifdef HAVE_OPENCL - -static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype, - double scale, double delta ) -{ - UMat src = _src.getUMat(); - - if( _mask.empty() ) - src.convertTo( _dst, dtype, scale, delta ); - else if (src.channels() <= 4) - { - const ocl::Device & dev = ocl::Device::getDefault(); - - int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), - ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)), - rowsPerWI = dev.isIntel() ? 
4 : 1; - - float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta); - bool haveScale = std::fabs(scale - 1) > DBL_EPSILON, - haveZeroScale = !(std::fabs(scale) > DBL_EPSILON), - haveDelta = std::fabs(delta) > DBL_EPSILON, - doubleSupport = dev.doubleFPConfig() > 0; - - if (!haveScale && !haveDelta && stype == dtype) - { - _src.copyTo(_dst, _mask); - return true; - } - if (haveZeroScale) - { - _dst.setTo(Scalar(delta), _mask); - return true; - } - - if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport) - return false; - - char cvt[2][40]; - String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d" " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s", - ocl::typeToStr(stype), ocl::typeToStr(dtype), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn, - rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", - haveScale ? " -D HAVE_SCALE" : "", - haveDelta ? " -D HAVE_DELTA" : "", - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth)); - - ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts); - if (k.empty()) - return false; - - UMat mask = _mask.getUMat(), dst = _dst.getUMat(); - - ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), - maskarg = ocl::KernelArg::ReadOnlyNoSize(mask), - dstarg = ocl::KernelArg::ReadWrite(dst); - - if (haveScale) - { - if (haveDelta) - k.args(srcarg, maskarg, dstarg, fscale, fdelta); - else - k.args(srcarg, maskarg, dstarg, fscale); - } - else - { - if (haveDelta) - k.args(srcarg, maskarg, dstarg, fdelta); - else - k.args(srcarg, maskarg, dstarg); - } - - size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, NULL, false); - } - else - { - UMat temp; - src.convertTo( temp, dtype, scale, delta ); - temp.copyTo( _dst, _mask ); - } - - return true; -} - -#endif - -void normalize(InputArray _src, InputOutputArray _dst, double a, double b, - int norm_type, int rtype, InputArray _mask) -{ - CV_INSTRUMENT_REGION(); - - double scale = 1, shift = 0; - int type = _src.type(), depth = CV_MAT_DEPTH(type); - - if( rtype < 0 ) - rtype = _dst.fixedType() ? _dst.depth() : depth; - - if( norm_type == CV_MINMAX ) - { - double smin = 0, smax = 0; - double dmin = MIN( a, b ), dmax = MAX( a, b ); - minMaxIdx( _src, &smin, &smax, 0, 0, _mask ); - scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0); - if( rtype == CV_32F ) - { - scale = (float)scale; - shift = (float)dmin - (float)(smin*scale); - } - else - shift = dmin - smin*scale; - } - else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C ) - { - scale = norm( _src, norm_type, _mask ); - scale = scale > DBL_EPSILON ? 
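// [Editor's worked example -- not part of the patch.] For the norm-based
// modes the transform is dst = src * (a / ||src||), so the output reaches
// norm `a`; for NORM_MINMAX above, scale = (dmax - dmin) / (smax - smin) and
// shift = dmin - smin * scale, mapping [smin, smax] onto [dmin, dmax].
// E.g. src = {2, 4} with NORM_L2 and a = 1: ||src|| = sqrt(20) ~= 4.472,
// so dst ~= {0.447, 0.894}.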
a/scale : 0.; - shift = 0; - } - else - CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" ); - - CV_OCL_RUN(_dst.isUMat(), - ocl_normalize(_src, _dst, _mask, rtype, scale, shift)) - - Mat src = _src.getMat(); - if( _mask.empty() ) - src.convertTo( _dst, rtype, scale, shift ); - else - { - Mat temp; - src.convertTo( temp, rtype, scale, shift ); - temp.copyTo( _dst, _mask ); - } -} - } // namespace diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index c1e86c6a9c..a6f06a5c7d 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -53,6 +53,72 @@ namespace cv { +template<typename T> static inline +void scalarToRawData_(const Scalar& s, T * const buf, const int cn, const int unroll_to) +{ + int i = 0; + for(; i < cn; i++) + buf[i] = saturate_cast<T>(s.val[i]); + for(; i < unroll_to; i++) + buf[i] = buf[i-cn]; +} + +void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to) +{ + CV_INSTRUMENT_REGION(); + + const int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + CV_Assert(cn <= 4); + switch(depth) + { + case CV_8U: + scalarToRawData_<uchar>(s, (uchar*)_buf, cn, unroll_to); + break; + case CV_8S: + scalarToRawData_<schar>(s, (schar*)_buf, cn, unroll_to); + break; + case CV_16U: + scalarToRawData_<ushort>(s, (ushort*)_buf, cn, unroll_to); + break; + case CV_16S: + scalarToRawData_<short>(s, (short*)_buf, cn, unroll_to); + break; + case CV_32S: + scalarToRawData_<int>(s, (int*)_buf, cn, unroll_to); + break; + case CV_32F: + scalarToRawData_<float>(s, (float*)_buf, cn, unroll_to); + break; + case CV_64F: + scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to); + break; + case CV_16F: + scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to); + break; + default: + CV_Error(CV_StsUnsupportedFormat,""); + } +} + +void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ) +{ + int scn = (int)sc.total(), cn = CV_MAT_CN(buftype); + size_t esz = CV_ELEM_SIZE(buftype); + BinaryFunc cvtFn = getConvertFunc(sc.depth(), buftype); + CV_Assert(cvtFn); + cvtFn(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0); + // unroll the scalar + if( scn < cn ) + { + CV_Assert( scn == 1 ); + size_t esz1 = CV_ELEM_SIZE1(buftype); + for( size_t i = esz1; i < esz; i++ ) + scbuf[i] = scbuf[i - esz1]; + } + for( size_t i = esz; i < blocksize*esz; i++ ) + scbuf[i] = scbuf[i - esz]; +} + template<typename T> static void copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size) { @@ -594,490 +660,6 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask) return *this; } -#if CV_SIMD128 -template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz ) -{ - typedef typename V::lane_type T; - int end = (int)(size.width*esz); - int width = (end + 1)/2; - int width_1 = width & -v_uint8x16::nlanes; - int i, j; - -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned<sizeof(T)>(src, dst)); -#endif - - for( ; size.height--; src += sstep, dst += dstep ) - { - for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes ) - { - V t0, t1; - - t0 = v_load((T*)((uchar*)src + i)); - t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes)); - t0 = v_reverse(t0); - t1 = v_reverse(t1); - v_store((T*)(dst + j - v_uint8x16::nlanes), t0); - v_store((T*)(dst + i), t1); - } - if (isAligned<sizeof(T)>(src, dst)) - { - for ( ; i < width; i += sizeof(T), j -= sizeof(T) ) - { - T t0, t1; - - t0 = *((T*)((uchar*)src + i)); - t1 = *((T*)((uchar*)src + j - sizeof(T))); - *((T*)(dst + j - sizeof(T))) = t0; - *((T*)(dst + 
i)) = t1; - } - } - else - { - for ( ; i < width; i += sizeof(T), j -= sizeof(T) ) - { - for (int k = 0; k < (int)sizeof(T); k++) - { - uchar t0, t1; - - t0 = *((uchar*)src + i + k); - t1 = *((uchar*)src + j + k - sizeof(T)); - *(dst + j + k - sizeof(T)) = t0; - *(dst + i + k) = t1; - } - } - } - } -} - -template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz ) -{ - int end = (int)(size.width*esz); - int width = (end + 1)/2; - -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned<sizeof(T1)>(src, dst)); - CV_Assert(isAligned<sizeof(T2)>(src, dst)); -#endif - - for( ; size.height--; src += sstep, dst += dstep ) - { - for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) ) - { - T1 t0, t1; - T2 t2, t3; - - t0 = *((T1*)((uchar*)src + i)); - t2 = *((T2*)((uchar*)src + i + sizeof(T1))); - t1 = *((T1*)((uchar*)src + j - sizeof(T1) - sizeof(T2))); - t3 = *((T2*)((uchar*)src + j - sizeof(T2))); - *((T1*)(dst + j - sizeof(T1) - sizeof(T2))) = t0; - *((T2*)(dst + j - sizeof(T2))) = t2; - *((T1*)(dst + i)) = t1; - *((T2*)(dst + i + sizeof(T1))) = t3; - } - } -} -#endif - -static void -flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz ) -{ -#if CV_SIMD -#if CV_STRONG_ALIGNMENT - size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep; -#endif - if (esz == 2 * v_uint8x16::nlanes) - { - int end = (int)(size.width*esz); - int width = end/2; - - for( ; size.height--; src += sstep, dst += dstep ) - { - for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes ) - { -#if CV_SIMD256 - v_uint8x32 t0, t1; - - t0 = v256_load((uchar*)src + i); - t1 = v256_load((uchar*)src + j); - v_store(dst + j, t0); - v_store(dst + i, t1); -#else - v_uint8x16 t0, t1, t2, t3; - - t0 = v_load((uchar*)src + i); - t1 = v_load((uchar*)src + i + v_uint8x16::nlanes); - t2 = v_load((uchar*)src + j); - t3 = v_load((uchar*)src + j + v_uint8x16::nlanes); - v_store(dst + j, t0); - v_store(dst + j + v_uint8x16::nlanes, t1); - v_store(dst + i, t2); - v_store(dst + i + v_uint8x16::nlanes, t3); -#endif - } - } - } - else if (esz == v_uint8x16::nlanes) - { - int end = (int)(size.width*esz); - int width = end/2; - - for( ; size.height--; src += sstep, dst += dstep ) - { - for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes ) - { - v_uint8x16 t0, t1; - - t0 = v_load((uchar*)src + i); - t1 = v_load((uchar*)src + j); - v_store(dst + j, t0); - v_store(dst + i, t1); - } - } - } - else if (esz == 8 -#if CV_STRONG_ALIGNMENT - && isAligned<sizeof(uint64_t)>(alignmentMark) -#endif - ) - { - flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz); - } - else if (esz == 4 -#if CV_STRONG_ALIGNMENT - && isAligned<sizeof(unsigned)>(alignmentMark) -#endif - ) - { - flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz); - } - else if (esz == 2 -#if CV_STRONG_ALIGNMENT - && isAligned<sizeof(ushort)>(alignmentMark) -#endif - ) - { - flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz); - } - else if (esz == 1) - { - flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz); - } - else if (esz == 24 -#if CV_STRONG_ALIGNMENT - && isAligned<sizeof(uint64_t)>(alignmentMark) -#endif - ) - { - int end = (int)(size.width*esz); - int width = (end + 1)/2; - - for( ; size.height--; src += sstep, dst += dstep ) - { - for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) ) - { - v_uint8x16 t0, t1; - uint64_t t2, t3; - - t0 = v_load((uchar*)src + 
i); - t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes)); - t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t)); - t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t))); - v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0); - *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2; - v_store(dst + i, t1); - *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3; - } - } - } -#if !CV_STRONG_ALIGNMENT - else if (esz == 12) - { - flipHoriz_double<uint64_t, uint>(src, sstep, dst, dstep, size, esz); - } - else if (esz == 6) - { - flipHoriz_double<uint, ushort>(src, sstep, dst, dstep, size, esz); - } - else if (esz == 3) - { - flipHoriz_double<ushort, uchar>(src, sstep, dst, dstep, size, esz); - } -#endif - else -#endif // CV_SIMD - { - int i, j, limit = (int)(((size.width + 1)/2)*esz); - AutoBuffer<int> _tab(size.width*esz); - int* tab = _tab.data(); - - for( i = 0; i < size.width; i++ ) - for( size_t k = 0; k < esz; k++ ) - tab[i*esz + k] = (int)((size.width - i - 1)*esz + k); - - for( ; size.height--; src += sstep, dst += dstep ) - { - for( i = 0; i < limit; i++ ) - { - j = tab[i]; - uchar t0 = src[i], t1 = src[j]; - dst[i] = t1; dst[j] = t0; - } - } - } -} - -static void -flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, size_t esz ) -{ - const uchar* src1 = src0 + (size.height - 1)*sstep; - uchar* dst1 = dst0 + (size.height - 1)*dstep; - size.width *= (int)esz; - - for( int y = 0; y < (size.height + 1)/2; y++, src0 += sstep, src1 -= sstep, - dst0 += dstep, dst1 -= dstep ) - { - int i = 0; -#if CV_SIMD -#if CV_STRONG_ALIGNMENT - if (isAligned<sizeof(int)>(src0, src1, dst0, dst1)) -#endif - { - for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH) - { - v_int32 t0 = vx_load((int*)(src0 + i)); - v_int32 t1 = vx_load((int*)(src1 + i)); - vx_store((int*)(dst0 + i), t1); - vx_store((int*)(dst1 + i), t0); - } - } -#if CV_STRONG_ALIGNMENT - else - { - for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH) - { - v_uint8 t0 = vx_load(src0 + i); - v_uint8 t1 = vx_load(src1 + i); - vx_store(dst0 + i, t1); - vx_store(dst1 + i, t0); - } - } -#endif -#endif - - if (isAligned<sizeof(int)>(src0, src1, dst0, dst1)) - { - for( ; i <= size.width - 16; i += 16 ) - { - int t0 = ((int*)(src0 + i))[0]; - int t1 = ((int*)(src1 + i))[0]; - - ((int*)(dst0 + i))[0] = t1; - ((int*)(dst1 + i))[0] = t0; - - t0 = ((int*)(src0 + i))[1]; - t1 = ((int*)(src1 + i))[1]; - - ((int*)(dst0 + i))[1] = t1; - ((int*)(dst1 + i))[1] = t0; - - t0 = ((int*)(src0 + i))[2]; - t1 = ((int*)(src1 + i))[2]; - - ((int*)(dst0 + i))[2] = t1; - ((int*)(dst1 + i))[2] = t0; - - t0 = ((int*)(src0 + i))[3]; - t1 = ((int*)(src1 + i))[3]; - - ((int*)(dst0 + i))[3] = t1; - ((int*)(dst1 + i))[3] = t0; - } - - for( ; i <= size.width - 4; i += 4 ) - { - int t0 = ((int*)(src0 + i))[0]; - int t1 = ((int*)(src1 + i))[0]; - - ((int*)(dst0 + i))[0] = t1; - ((int*)(dst1 + i))[0] = t0; - } - } - - for( ; i < size.width; i++ ) - { - uchar t0 = src0[i]; - uchar t1 = src1[i]; - - dst0[i] = t1; - dst1[i] = t0; - } - } -} - -#ifdef HAVE_OPENCL - -enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS }; - -static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) -{ - CV_Assert(flipCode >= -1 && flipCode <= 1); - - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), - flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4); - - bool doubleSupport = dev.doubleFPConfig() > 0; - if (!doubleSupport && depth == CV_64F) - kercn = cn; - - if (cn > 4) - 
return false; - - const char * kernelName; - if (flipCode == 0) - kernelName = "arithm_flip_rows", flipType = FLIP_ROWS; - else if (flipCode > 0) - kernelName = "arithm_flip_cols", flipType = FLIP_COLS; - else - kernelName = "arithm_flip_rows_cols", flipType = FLIP_BOTH; - - int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1; - kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn; - - ocl::Kernel k(kernelName, ocl::core::flip_oclsrc, - format( "-D T=%s -D T1=%s -D DEPTH=%d -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d", - kercn != cn ? ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)) : ocl::vecopTypeToStr(CV_MAKE_TYPE(depth, kercn)), - kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), depth, cn, pxPerWIy, kercn)); - if (k.empty()) - return false; - - Size size = _src.size(); - _dst.create(size, type); - UMat src = _src.getUMat(), dst = _dst.getUMat(); - - int cols = size.width * cn / kercn, rows = size.height; - cols = flipType == FLIP_COLS ? (cols + 1) >> 1 : cols; - rows = flipType & FLIP_ROWS ? (rows + 1) >> 1 : rows; - - k.args(ocl::KernelArg::ReadOnlyNoSize(src), - ocl::KernelArg::WriteOnly(dst, cn, kercn), rows, cols); - - size_t maxWorkGroupSize = dev.maxWorkGroupSize(); - CV_Assert(maxWorkGroupSize % 4 == 0); - - size_t globalsize[2] = { (size_t)cols, ((size_t)rows + pxPerWIy - 1) / pxPerWIy }, - localsize[2] = { maxWorkGroupSize / 4, 4 }; - return k.run(2, globalsize, (flipType == FLIP_COLS) && !dev.isIntel() ? localsize : NULL, false); -} - -#endif - -#if defined HAVE_IPP -static bool ipp_flip(Mat &src, Mat &dst, int flip_mode) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - - // Details: https://github.com/opencv/opencv/issues/12943 - if (flip_mode <= 0 /* swap rows */ - && cv::ipp::getIppTopFeatures() != ippCPUID_SSE42 - && (int64_t)(src.total()) * src.elemSize() >= CV_BIG_INT(0x80000000)/*2Gb*/ - ) - return false; - - IppiAxis ippMode; - if(flip_mode < 0) - ippMode = ippAxsBoth; - else if(flip_mode == 0) - ippMode = ippAxsHorizontal; - else - ippMode = ippAxsVertical; - - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiMirror, iwSrc, iwDst, ippMode); - } - catch(const ::ipp::IwException &) - { - return false; - } - - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(flip_mode); - return false; -#endif -} -#endif - - -void flip( InputArray _src, OutputArray _dst, int flip_mode ) -{ - CV_INSTRUMENT_REGION(); - - CV_Assert( _src.dims() <= 2 ); - Size size = _src.size(); - - if (flip_mode < 0) - { - if (size.width == 1) - flip_mode = 0; - if (size.height == 1) - flip_mode = 1; - } - - if ((size.width == 1 && flip_mode > 0) || - (size.height == 1 && flip_mode == 0)) - { - return _src.copyTo(_dst); - } - - CV_OCL_RUN( _dst.isUMat(), ocl_flip(_src, _dst, flip_mode)) - - Mat src = _src.getMat(); - int type = src.type(); - _dst.create( size, type ); - Mat dst = _dst.getMat(); - - CV_IPP_RUN_FAST(ipp_flip(src, dst, flip_mode)); - - size_t esz = CV_ELEM_SIZE(type); - - if( flip_mode <= 0 ) - flipVert( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz ); - else - flipHoriz( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz ); - - if( flip_mode < 0 ) - flipHoriz( dst.ptr(), dst.step, dst.ptr(), dst.step, dst.size(), esz ); -} - -void rotate(InputArray _src, OutputArray _dst, int rotateMode) -{ - CV_Assert(_src.dims() <= 2); - - switch (rotateMode) - { - case ROTATE_90_CLOCKWISE: - transpose(_src, _dst); - flip(_dst, _dst, 
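// [Editor's sketch -- not part of the patch.] rotate() composes transpose
// and flip: 90 deg CW = transpose + horizontal flip (flipCode 1, here),
// 180 deg = flip around both axes (flipCode -1), 90 deg CCW = transpose +
// vertical flip (flipCode 0). On a 2x2 block, 90 deg CW:
//
//   [a b]  --transpose-->  [a c]  --flip columns-->  [c a]
//   [c d]                  [b d]                     [d b]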
1); - break; - case ROTATE_180: - flip(_src, _dst, -1); - break; - case ROTATE_90_COUNTERCLOCKWISE: - transpose(_src, _dst); - flip(_dst, _dst, 0); - break; - default: - break; - } -} #if defined HAVE_OPENCL && !defined __APPLE__ @@ -1325,13 +907,12 @@ void copyMakeConstBorder_8u( const uchar* src, size_t srcstep, cv::Size srcroi, memcpy( dstInner + srcroi.width, constBuf, right ); } - dst += dststep*top; - for( i = 0; i < top; i++ ) - memcpy(dst + (i - top)*dststep, constBuf, dstroi.width); + memcpy(dst + i * dststep, constBuf, dstroi.width); + dst += (top + srcroi.height) * dststep; for( i = 0; i < bottom; i++ ) - memcpy(dst + (i + srcroi.height)*dststep, constBuf, dstroi.width); + memcpy(dst + i * dststep, constBuf, dstroi.width); } } @@ -1500,6 +1081,9 @@ void cv::copyMakeBorder( InputArray _src, OutputArray _dst, int top, int bottom, } } + +#ifndef OPENCV_EXCLUDE_C_API + /* dst = src */ CV_IMPL void cvCopy( const void* srcarr, void* dstarr, const void* maskarr ) @@ -1606,4 +1190,5 @@ cvFlip( const CvArr* srcarr, CvArr* dstarr, int flip_mode ) cv::flip( src, dst, flip_mode ); } +#endif // OPENCV_EXCLUDE_C_API /* End of file. */ diff --git a/modules/core/src/count_non_zero.dispatch.cpp b/modules/core/src/count_non_zero.dispatch.cpp index 089359d3e1..aac0c81293 100644 --- a/modules/core/src/count_non_zero.dispatch.cpp +++ b/modules/core/src/count_non_zero.dispatch.cpp @@ -62,11 +62,9 @@ static bool ipp_countNonZero( Mat &src, int &res ) { CV_INSTRUMENT_REGION_IPP(); -#if defined __APPLE__ || (defined _MSC_VER && defined _M_IX86) // see https://github.com/opencv/opencv/issues/17453 - if (src.dims <= 2 && src.step > 520000) + if (src.dims <= 2 && src.step > 520000 && cv::ipp::getIppTopFeatures() == ippCPUID_SSE42) return false; -#endif #if IPP_VERSION_X100 < 201801 // Poor performance of SSE42 diff --git a/modules/core/src/cuda/gpu_mat_nd.cu b/modules/core/src/cuda/gpu_mat_nd.cu new file mode 100644 index 0000000000..3f51fd8afa --- /dev/null +++ b/modules/core/src/cuda/gpu_mat_nd.cu @@ -0,0 +1,269 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
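// [Editor's sketch -- not part of the patch.] The new file below implements
// the CUDA side of GpuMatND. A hypothetical host/device round trip, assuming
// a CUDA-enabled build and an available device:
//
//   int sz[] = {2, 3, 4};
//   cv::Mat host(3, sz, CV_32FC1, cv::Scalar(0));
//   cv::cuda::GpuMatND gpu;
//   gpu.upload(host);   // allocates device memory and copies up
//   cv::Mat back;
//   gpu.download(back); // copies back; back ends up equal to host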
+ +#include "opencv2/opencv_modules.hpp" + +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/core/cuda.hpp" +#include "opencv2/cudev.hpp" + +using namespace cv; +using namespace cv::cuda; + +GpuData::GpuData(const size_t _size) + : data(nullptr), size(_size) +{ + CV_CUDEV_SAFE_CALL(cudaMalloc(&data, _size)); +} + +GpuData::~GpuData() +{ + CV_CUDEV_SAFE_CALL(cudaFree(data)); +} + +///////////////////////////////////////////////////// +/// create + +void GpuMatND::create(SizeArray _size, int _type) +{ + { + auto elements_nonzero = [](SizeArray& v) + { + return std::all_of(v.begin(), v.end(), + [](unsigned u){ return u > 0; }); + }; + CV_Assert(!_size.empty()); + CV_Assert(elements_nonzero(_size)); + } + + _type &= Mat::TYPE_MASK; + + if (size == _size && type() == _type && !empty() && !external() && isContinuous() && !isSubmatrix()) + return; + + release(); + + setFields(std::move(_size), _type); + + data_ = std::make_shared<GpuData>(totalMemSize()); + data = data_->data; + offset = 0; +} + +///////////////////////////////////////////////////// +/// release + +void GpuMatND::release() +{ + data = nullptr; + data_.reset(); + + flags = dims = offset = 0; + size.clear(); + step.clear(); +} + +///////////////////////////////////////////////////// +/// clone + +static bool next(uchar*& d, const uchar*& s, std::vector<int>& idx, const int dims, const GpuMatND& dst, const GpuMatND& src) +{ + int inc = dims-3; + + while (true) + { + if (idx[inc] == src.size[inc] - 1) + { + if (inc == 0) + { + return false; + } + + idx[inc] = 0; + d -= (dst.size[inc] - 1) * dst.step[inc]; + s -= (src.size[inc] - 1) * src.step[inc]; + inc--; + } + else + { + idx[inc]++; + d += dst.step[inc]; + s += src.step[inc]; + break; + } + } + + return true; +} + +GpuMatND GpuMatND::clone() const +{ + CV_DbgAssert(!empty()); + + GpuMatND ret(size, type()); + + if (isContinuous()) + { + CV_CUDEV_SAFE_CALL(cudaMemcpy(ret.getDevicePtr(), getDevicePtr(), ret.totalMemSize(), cudaMemcpyDeviceToDevice)); + } + else + { + // 1D arrays are always continuous + + if (dims == 2) + { + CV_CUDEV_SAFE_CALL( + cudaMemcpy2D(ret.getDevicePtr(), ret.step[0], getDevicePtr(), step[0], + size[1]*step[1], size[0], cudaMemcpyDeviceToDevice) + ); + } + else + { + std::vector<int> idx(dims-2, 0); + + uchar* d = ret.getDevicePtr(); + const uchar* s = getDevicePtr(); + + // iterate each 2D plane + do + { + CV_CUDEV_SAFE_CALL( + cudaMemcpy2DAsync( + d, ret.step[dims-2], s, step[dims-2], + size[dims-1]*step[dims-1], size[dims-2], cudaMemcpyDeviceToDevice) + ); + } + while (next(d, s, idx, dims, ret, *this)); + + CV_CUDEV_SAFE_CALL(cudaStreamSynchronize(0)); + } + } + + return ret; +} + +GpuMatND GpuMatND::clone(Stream& stream) const +{ + CV_DbgAssert(!empty()); + + GpuMatND ret(size, type()); + + cudaStream_t _stream = StreamAccessor::getStream(stream); + + if (isContinuous()) + { + CV_CUDEV_SAFE_CALL(cudaMemcpyAsync(ret.getDevicePtr(), getDevicePtr(), ret.totalMemSize(), cudaMemcpyDeviceToDevice, _stream)); + } + else + { + // 1D arrays are always continuous + + if (dims == 2) + { + CV_CUDEV_SAFE_CALL( + cudaMemcpy2DAsync(ret.getDevicePtr(), ret.step[0], getDevicePtr(), step[0], + size[1]*step[1], size[0], cudaMemcpyDeviceToDevice, _stream) + ); + } + else + { + std::vector<int> idx(dims-2, 0); + + uchar* d = ret.getDevicePtr(); + const uchar* s = getDevicePtr(); + + // iterate each 2D plane + do + { + CV_CUDEV_SAFE_CALL( + cudaMemcpy2DAsync( + d, ret.step[dims-2], s, step[dims-2], + size[dims-1]*step[dims-1], size[dims-2], 
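// [Editor's note -- not part of the patch.] For non-continuous arrays the
// clone loops above copy one 2D plane per cudaMemcpy2D(Async) call, while
// next() advances a multi-index over the outer dimensions like an odometer.
// For size {2, 3, H, W} the planes are visited as idx = {0,0}, {0,1}, {0,2},
// {1,0}, {1,1}, {1,2} -- 2*3 = 6 copies in total, each moving H rows of
// W * elemSize() bytes.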
cudaMemcpyDeviceToDevice, _stream) + ); + } + while (next(d, s, idx, dims, ret, *this)); + } + } + + return ret; +} + +///////////////////////////////////////////////////// +/// upload + +void GpuMatND::upload(InputArray src) +{ + Mat mat = src.getMat(); + + CV_DbgAssert(!mat.empty()); + + if (!mat.isContinuous()) + mat = mat.clone(); + + SizeArray _size(mat.dims); + std::copy_n(mat.size.p, mat.dims, _size.data()); + + create(std::move(_size), mat.type()); + + CV_CUDEV_SAFE_CALL(cudaMemcpy(getDevicePtr(), mat.data, totalMemSize(), cudaMemcpyHostToDevice)); +} + +void GpuMatND::upload(InputArray src, Stream& stream) +{ + Mat mat = src.getMat(); + + CV_DbgAssert(!mat.empty()); + + if (!mat.isContinuous()) + mat = mat.clone(); + + SizeArray _size(mat.dims); + std::copy_n(mat.size.p, mat.dims, _size.data()); + + create(std::move(_size), mat.type()); + + cudaStream_t _stream = StreamAccessor::getStream(stream); + CV_CUDEV_SAFE_CALL(cudaMemcpyAsync(getDevicePtr(), mat.data, totalMemSize(), cudaMemcpyHostToDevice, _stream)); +} + +///////////////////////////////////////////////////// +/// download + +void GpuMatND::download(OutputArray dst) const +{ + CV_DbgAssert(!empty()); + + dst.create(dims, size.data(), type()); + Mat mat = dst.getMat(); + + GpuMatND gmat = *this; + + if (!gmat.isContinuous()) + gmat = gmat.clone(); + + CV_CUDEV_SAFE_CALL(cudaMemcpy(mat.data, gmat.getDevicePtr(), mat.total() * mat.elemSize(), cudaMemcpyDeviceToHost)); +} + +void GpuMatND::download(OutputArray dst, Stream& stream) const +{ + CV_DbgAssert(!empty()); + + dst.create(dims, size.data(), type()); + Mat mat = dst.getMat(); + + GpuMatND gmat = *this; + + if (!gmat.isContinuous()) + gmat = gmat.clone(stream); + + cudaStream_t _stream = StreamAccessor::getStream(stream); + CV_CUDEV_SAFE_CALL(cudaMemcpyAsync(mat.data, gmat.getDevicePtr(), mat.total() * mat.elemSize(), cudaMemcpyDeviceToHost, _stream)); +} + +#endif diff --git a/modules/core/src/cuda_gpu_mat_nd.cpp b/modules/core/src/cuda_gpu_mat_nd.cpp new file mode 100644 index 0000000000..8440f179ea --- /dev/null +++ b/modules/core/src/cuda_gpu_mat_nd.cpp @@ -0,0 +1,180 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
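// [Editor's sketch -- not part of the patch.] The companion file below holds
// the CUDA-independent GpuMatND logic (range views, 2D headers, field setup).
// A hypothetical zero-copy slice:
//
//   cv::cuda::GpuMatND nd({4, 5, 6}, CV_8UC1);          // 4x5x6 device array
//   std::vector<cv::Range> rs = { cv::Range(1, 3),      // keep planes 1..2
//                                 cv::Range::all(),
//                                 cv::Range::all() };
//   cv::cuda::GpuMatND view = nd(rs);                   // shares nd's memory
//   cv::cuda::GpuMat plane = view.createGpuMatHeader(
//       {0}, cv::Range::all(), cv::Range::all());       // 5x6 2D header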
+ +#include "precomp.hpp" + +using namespace cv; +using namespace cv::cuda; + +GpuMatND::~GpuMatND() = default; + +GpuMatND::GpuMatND(SizeArray _size, int _type, void* _data, StepArray _step) : + flags(0), dims(0), data(static_cast<uchar*>(_data)), offset(0) +{ + CV_Assert(_step.empty() || _size.size() == _step.size() + 1); + + setFields(std::move(_size), _type, std::move(_step)); +} + +GpuMatND GpuMatND::operator()(const std::vector<Range>& ranges) const +{ + CV_Assert(dims == (int)ranges.size()); + + for (int i = 0; i < dims; ++i) + { + Range r = ranges[i]; + CV_Assert(r == Range::all() || (0 <= r.start && r.start < r.end && r.end <= size[i])); + } + + GpuMatND ret = *this; + + for (int i = 0; i < dims; ++i) + { + Range r = ranges[i]; + if (r != Range::all() && r != Range(0, ret.size[i])) + { + ret.offset += r.start * ret.step[i]; + ret.size[i] = r.size(); + ret.flags |= Mat::SUBMATRIX_FLAG; + } + } + + ret.flags = cv::updateContinuityFlag(ret.flags, dims, ret.size.data(), ret.step.data()); + + return ret; +} + +GpuMat GpuMatND::createGpuMatHeader(IndexArray idx, Range rowRange, Range colRange) const +{ + CV_Assert((int)idx.size() == dims - 2); + + std::vector<Range> ranges; + for (int i : idx) + ranges.emplace_back(i, i+1); + ranges.push_back(rowRange); + ranges.push_back(colRange); + + return (*this)(ranges).createGpuMatHeader(); +} + +GpuMat GpuMatND::createGpuMatHeader() const +{ + auto Effectively2D = [](GpuMatND m) + { + for (int i = 0; i < m.dims - 2; ++i) + if (m.size[i] > 1) + return false; + return true; + }; + CV_Assert(Effectively2D(*this)); + + return GpuMat(size[dims-2], size[dims-1], type(), getDevicePtr(), step[dims-2]); +} + +GpuMat GpuMatND::operator()(IndexArray idx, Range rowRange, Range colRange) const +{ + return createGpuMatHeader(idx, rowRange, colRange).clone(); +} + +GpuMatND::operator GpuMat() const +{ + return createGpuMatHeader().clone(); +} + +void GpuMatND::setFields(SizeArray _size, int _type, StepArray _step) +{ + _type &= Mat::TYPE_MASK; + + flags = Mat::MAGIC_VAL + _type; + dims = static_cast<int>(_size.size()); + size = std::move(_size); + + if (_step.empty()) + { + step = StepArray(dims); + + step.back() = elemSize(); + for (int _i = dims - 2; _i >= 0; --_i) + { + const size_t i = _i; + step[i] = step[i+1] * size[i+1]; + } + + flags |= Mat::CONTINUOUS_FLAG; + } + else + { + step = std::move(_step); + step.push_back(elemSize()); + + flags = cv::updateContinuityFlag(flags, dims, size.data(), step.data()); + } + + CV_Assert(size.size() == step.size()); + CV_Assert(step.back() == elemSize()); +} + +#ifndef HAVE_CUDA + +GpuData::GpuData(const size_t _size) + : data(nullptr), size(0) +{ + CV_UNUSED(_size); + throw_no_cuda(); +} + +GpuData::~GpuData() +{ +} + +void GpuMatND::create(SizeArray _size, int _type) +{ + CV_UNUSED(_size); + CV_UNUSED(_type); + throw_no_cuda(); +} + +void GpuMatND::release() +{ + throw_no_cuda(); +} + +GpuMatND GpuMatND::clone() const +{ + throw_no_cuda(); +} + +GpuMatND GpuMatND::clone(Stream& stream) const +{ + CV_UNUSED(stream); + throw_no_cuda(); +} + +void GpuMatND::upload(InputArray src) +{ + CV_UNUSED(src); + throw_no_cuda(); +} + +void GpuMatND::upload(InputArray src, Stream& stream) +{ + CV_UNUSED(src); + CV_UNUSED(stream); + throw_no_cuda(); +} + +void GpuMatND::download(OutputArray dst) const +{ + CV_UNUSED(dst); + throw_no_cuda(); +} + +void GpuMatND::download(OutputArray dst, Stream& stream) const +{ + CV_UNUSED(dst); + CV_UNUSED(stream); + throw_no_cuda(); +} + +#endif diff --git a/modules/core/src/cuda_stream.cpp 
b/modules/core/src/cuda_stream.cpp index 5fb873a369..3680e0720a 100644 --- a/modules/core/src/cuda_stream.cpp +++ b/modules/core/src/cuda_stream.cpp @@ -41,6 +41,7 @@ //M*/ #include "precomp.hpp" +#include <climits> using namespace cv; using namespace cv::cuda; @@ -293,6 +294,7 @@ public: Impl(); Impl(const Ptr<GpuMat::Allocator>& allocator); + Impl(const unsigned int cudaFlags); explicit Impl(cudaStream_t stream); ~Impl(); @@ -312,6 +314,13 @@ cv::cuda::Stream::Impl::Impl(const Ptr<GpuMat::Allocator>& allocator) : stream(0 ownStream = true; } +cv::cuda::Stream::Impl::Impl(const unsigned int cudaFlags) : stream(0), ownStream(false) +{ + cudaSafeCall(cudaStreamCreateWithFlags(&stream, cudaFlags)); + ownStream = true; + allocator = makePtr<StackAllocator>(stream); +} + cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_), ownStream(false) { allocator = makePtr<StackAllocator>(stream); @@ -450,6 +459,16 @@ cv::cuda::Stream::Stream(const Ptr<GpuMat::Allocator>& allocator) #endif } +cv::cuda::Stream::Stream(const size_t cudaFlags) +{ +#ifndef HAVE_CUDA + CV_UNUSED(cudaFlags); + throw_no_cuda(); +#else + impl_ = makePtr<Impl>(cudaFlags & UINT_MAX); +#endif +} + bool cv::cuda::Stream::queryIfComplete() const { #ifndef HAVE_CUDA diff --git a/modules/core/src/datastructs.cpp b/modules/core/src/datastructs.cpp index c00266dd55..1ff6fa9178 100644 --- a/modules/core/src/datastructs.cpp +++ b/modules/core/src/datastructs.cpp @@ -40,6 +40,8 @@ //M*/ #include "precomp.hpp" +#ifndef OPENCV_EXCLUDE_C_API + /* default alignment for dynamic data structures residing in storages. */ #define CV_STRUCT_ALIGN ((int)sizeof(double)) @@ -3585,4 +3587,5 @@ void seqInsertSlice( CvSeq* seq, int before_index, const CvArr* from_arr ) } +#endif // OPENCV_EXCLUDE_C_API /* End of file. */ diff --git a/modules/core/src/directx.cpp b/modules/core/src/directx.cpp index f028702d7f..0173f02916 100644 --- a/modules/core/src/directx.cpp +++ b/modules/core/src/directx.cpp @@ -1050,7 +1050,7 @@ bool ocl_convert_nv12_to_bgr( k.args(clImageY, clImageUV, clBuffer, step, cols, rows); - size_t globalsize[] = { (size_t)cols, (size_t)rows }; + size_t globalsize[] = { (size_t)cols/2, (size_t)rows/2 }; return k.run(2, globalsize, 0, false); } @@ -1071,7 +1071,7 @@ bool ocl_convert_bgr_to_nv12( k.args(clBuffer, step, cols, rows, clImageY, clImageUV); - size_t globalsize[] = { (size_t)cols, (size_t)rows }; + size_t globalsize[] = { (size_t)cols/2, (size_t)rows/2 }; return k.run(2, globalsize, 0, false); } diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index fcdb2a202f..87873666d9 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -4640,6 +4640,9 @@ int cv::getOptimalDFTSize( int size0 ) return optimalDFTSizeTab[b]; } + +#ifndef OPENCV_EXCLUDE_C_API + CV_IMPL void cvDFT( const CvArr* srcarr, CvArr* dstarr, int flags, int nonzero_rows ) { @@ -4695,4 +4698,5 @@ cvGetOptimalDFTSize( int size0 ) return cv::getOptimalDFTSize(size0); } +#endif // OPENCV_EXCLUDE_C_API /* End of file. 
*/
diff --git a/modules/core/src/hal_internal.cpp b/modules/core/src/hal_internal.cpp
index a31d1aa672..7ed15bdd8e 100644
--- a/modules/core/src/hal_internal.cpp
+++ b/modules/core/src/hal_internal.cpp
@@ -42,6 +42,7 @@
 //
 //M*/
 
+#include "precomp.hpp"
 #include "hal_internal.hpp"
 
 #ifdef HAVE_LAPACK
diff --git a/modules/core/src/hal_internal.hpp b/modules/core/src/hal_internal.hpp
index 129a710145..c7a0d46de4 100644
--- a/modules/core/src/hal_internal.hpp
+++ b/modules/core/src/hal_internal.hpp
@@ -45,8 +45,6 @@
 #ifndef OPENCV_CORE_HAL_INTERNAL_HPP
 #define OPENCV_CORE_HAL_INTERNAL_HPP
 
-#include "precomp.hpp"
-
 #ifdef HAVE_LAPACK
 
 int lapack_LU32f(float* a, size_t a_step, int m, float* b, size_t b_step, int n, int* info);
diff --git a/modules/core/src/intel_gpu_gemm.inl.hpp b/modules/core/src/intel_gpu_gemm.inl.hpp
index 729b43f604..fbd567b949 100644
--- a/modules/core/src/intel_gpu_gemm.inl.hpp
+++ b/modules/core/src/intel_gpu_gemm.inl.hpp
@@ -25,7 +25,6 @@
 #ifdef HAVE_OPENCL
 
 #include <sstream>
-#include "precomp.hpp"
 #include "opencl_kernels_core.hpp"
 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index 486b7a5aba..9bca6a8211 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -753,8 +753,6 @@ SVBkSb( int m, int n, const double* w, size_t wstep,
            (double*)alignPtr(buffer, sizeof(double)), DBL_EPSILON*2 );
 }
 
-}
-
 /****************************************************************************************\
 *                               Determinant of the matrix                                *
 \****************************************************************************************/
@@ -764,7 +762,7 @@ SVBkSb( int m, int n, const double* w, size_t wstep,
     m(0,1)*((double)m(1,0)*m(2,2) - (double)m(1,2)*m(2,0)) +  \
     m(0,2)*((double)m(1,0)*m(2,1) - (double)m(1,1)*m(2,0)))
 
-double cv::determinant( InputArray _mat )
+double determinant( InputArray _mat )
 {
     CV_INSTRUMENT_REGION();
 
@@ -842,7 +840,7 @@ double cv::determinant( InputArray _mat )
 #define Df( y, x ) ((float*)(dstdata + y*dststep))[x]
 #define Dd( y, x ) ((double*)(dstdata + y*dststep))[x]
 
-double cv::invert( InputArray _src, OutputArray _dst, int method )
+double invert( InputArray _src, OutputArray _dst, int method )
 {
     CV_INSTRUMENT_REGION();
 
@@ -1069,13 +1067,19 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
     return result;
 }
 
+UMat UMat::inv(int method) const
+{
+    UMat m;
+    invert(*this, m, method);
+    return m;
+}
 
 /****************************************************************************************\
 *                               Solving a linear system                                  *
 \****************************************************************************************/
 
-bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int method )
+bool solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int method )
 {
     CV_INSTRUMENT_REGION();
 
@@ -1374,7 +1378,7 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
 
 /////////////////// finding eigenvalues and eigenvectors of a symmetric matrix ///////////////
 
-bool cv::eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
+bool eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
 {
     CV_INSTRUMENT_REGION();
 
@@ -1396,7 +1400,7 @@ bool cv::eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
     const bool evecNeeded = _evects.needed();
     const int esOptions = evecNeeded ?
Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly; _evals.create(n, 1, type); - cv::Mat evals = _evals.getMat(); + Mat evals = _evals.getMat(); if ( type == CV_64F ) { Eigen::MatrixXd src_eig, zeros_eig; @@ -1448,9 +1452,6 @@ bool cv::eigen( InputArray _src, OutputArray _evals, OutputArray _evects ) #endif } -namespace cv -{ - static void _SVDcompute( InputArray _aarr, OutputArray _w, OutputArray _u, OutputArray _vt, int flags ) { @@ -1598,6 +1599,9 @@ void cv::SVBackSubst(InputArray w, InputArray u, InputArray vt, InputArray rhs, } + +#ifndef OPENCV_EXCLUDE_C_API + CV_IMPL double cvDet( const CvArr* arr ) { @@ -1789,3 +1793,4 @@ cvSVBkSb( const CvArr* warr, const CvArr* uarr, cv::SVD::backSubst(w, u, v, rhs, dst); CV_Assert( dst.data == dst0.data ); } +#endif // OPENCV_EXCLUDE_C_API diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index d4f8dc0ba4..f968bec02f 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -1638,6 +1638,9 @@ void patchNaNs( InputOutputArray _a, double _val ) } + +#ifndef OPENCV_EXCLUDE_C_API + CV_IMPL void cvExp( const CvArr* srcarr, CvArr* dstarr ) { cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); @@ -1660,6 +1663,7 @@ CV_IMPL int cvCheckArr( const CvArr* arr, int flags, return cv::checkRange(cv::cvarrToMat(arr), (flags & CV_CHECK_QUIET) != 0, 0, minVal, maxVal ); } +#endif // OPENCV_EXCLUDE_C_API /* Finds real roots of cubic, quadratic or linear equation. @@ -1955,6 +1959,8 @@ double cv::solvePoly( InputArray _coeffs0, OutputArray _roots0, int maxIters ) } +#ifndef OPENCV_EXCLUDE_C_API + void cvSolvePoly(const CvMat* a, CvMat *r, int maxiter, int) { cv::Mat _a = cv::cvarrToMat(a); @@ -1964,6 +1970,7 @@ void cvSolvePoly(const CvMat* a, CvMat *r, int maxiter, int) CV_Assert( _r.data == _r0.data ); // check that the array of roots was not reallocated } +#endif // OPENCV_EXCLUDE_C_API // Common constants for dispatched code diff --git a/modules/core/src/mathfuncs_core.dispatch.cpp b/modules/core/src/mathfuncs_core.dispatch.cpp index e48f84ebbe..3c53ab1c38 100644 --- a/modules/core/src/mathfuncs_core.dispatch.cpp +++ b/modules/core/src/mathfuncs_core.dispatch.cpp @@ -7,6 +7,10 @@ #include "mathfuncs_core.simd.hpp" #include "mathfuncs_core.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +#define IPP_DISABLE_MAGNITUDE_32F 1 // accuracy: https://github.com/opencv/opencv/issues/19506 + + namespace cv { namespace hal { ///////////////////////////////////// ATAN2 //////////////////////////////////// @@ -44,8 +48,25 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) CV_INSTRUMENT_REGION(); CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len); + +#ifdef HAVE_IPP + bool allowIPP = true; +#ifdef IPP_DISABLE_MAGNITUDE_32F + if (cv::ipp::getIppTopFeatures() & ( +#if IPP_VERSION_X100 >= 201700 + ippCPUID_AVX512F | +#endif + ippCPUID_AVX2) + ) + { + allowIPP = (len & 7) == 0; + } +#endif + // SSE42 performance issues - CV_IPP_RUN(IPP_VERSION_X100 > 201800 || cv::ipp::getIppTopFeatures() != ippCPUID_SSE42, CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0); + CV_IPP_RUN((IPP_VERSION_X100 > 201800 || cv::ipp::getIppTopFeatures() != ippCPUID_SSE42) && allowIPP, + CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0); +#endif CV_CPU_DISPATCH(magnitude32f, (x, y, mag, len), CV_CPU_DISPATCH_MODES_ALL); diff --git a/modules/core/src/matmul.dispatch.cpp b/modules/core/src/matmul.dispatch.cpp index 
a9b82aee88..f4bd14b5dd 100644 --- a/modules/core/src/matmul.dispatch.cpp +++ b/modules/core/src/matmul.dispatch.cpp @@ -999,8 +999,79 @@ double Mat::dot(InputArray _mat) const return r; } + +#ifdef HAVE_OPENCL + +static bool ocl_dot( InputArray _src1, InputArray _src2, double & res ) +{ + UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1); + + int type = src1.type(), depth = CV_MAT_DEPTH(type), + kercn = ocl::predictOptimalVectorWidth(src1, src2); + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + + if ( !doubleSupport && depth == CV_64F ) + return false; + + int dbsize = ocl::Device::getDefault().maxComputeUnits(); + size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); + int ddepth = std::max(CV_32F, depth); + + int wgs2_aligned = 1; + while (wgs2_aligned < (int)wgs) + wgs2_aligned <<= 1; + wgs2_aligned >>= 1; + + char cvt[40]; + ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, + format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT " + "-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d", + ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth), + ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), + ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt), + (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", + _src1.isContinuous() ? " -D HAVE_SRC_CONT" : "", + _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn)); + if (k.empty()) + return false; + + UMat db(1, dbsize, ddepth); + + ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1), + src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), + dbarg = ocl::KernelArg::PtrWriteOnly(db); + + k.args(src1arg, src1.cols, (int)src1.total(), dbsize, dbarg, src2arg); + + size_t globalsize = dbsize * wgs; + if (k.run(1, &globalsize, &wgs, true)) + { + res = sum(db.getMat(ACCESS_READ))[0]; + return true; + } + return false; +} + +#endif + +double UMat::dot(InputArray m) const +{ + CV_INSTRUMENT_REGION(); + + CV_Assert(m.sameSize(*this) && m.type() == type()); + +#ifdef HAVE_OPENCL + double r = 0; + CV_OCL_RUN_(dims <= 2, ocl_dot(*this, m, r), r) +#endif + + return getMat(ACCESS_READ).dot(m); +} + } // namespace cv:: + +#ifndef OPENCV_EXCLUDE_C_API /****************************************************************************************\ * Earlier API * \****************************************************************************************/ @@ -1225,4 +1296,6 @@ cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr, CV_Assert(dst0.data == dst.data); } +#endif // OPENCV_EXCLUDE_C_API + /* End of file. 
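The matmul.dispatch.cpp hunks above give UMat::dot an OpenCL reduction path, and the neighboring lapack.cpp and matrix_operations.cpp changes add UMat::inv and UMat::eye. A minimal sketch of how the three fit together, not part of the patch (illustrative function name; OpenCV transparently falls back to the CPU path when no usable OpenCL device is present):

#include <opencv2/core.hpp>

double umatExample()
{
    cv::UMat a = cv::UMat::eye(3, 3, CV_32F);  // new UMat::eye overload
    cv::UMat b;
    a.convertTo(b, CV_32F, 2.0);               // b = 2*I

    cv::UMat bInv = b.inv();                   // new UMat::inv, calls cv::invert
    return a.dot(bInv);                        // new UMat::dot, OpenCL when possible
}
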
*/ diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp index 38973ea1a4..c828e2906d 100644 --- a/modules/core/src/matmul.simd.hpp +++ b/modules/core/src/matmul.simd.hpp @@ -1537,7 +1537,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in static void transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn ) { -#if CV_SIMD && !defined(__aarch64__) && !defined(_M_ARM64) +#if CV_SIMD if( scn == 3 && dcn == 3 ) { int x = 0; diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 122b383379..61abc2ba8f 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -204,7 +204,7 @@ MatAllocator* Mat::getStdAllocator() //================================================================================================== -bool MatSize::operator==(const MatSize& sz) const +bool MatSize::operator==(const MatSize& sz) const CV_NOEXCEPT { int d = dims(); int dsz = sz.dims(); @@ -337,7 +337,7 @@ void finalizeHdr(Mat& m) //======================================= Mat ====================================================== -Mat::Mat() +Mat::Mat() CV_NOEXCEPT : flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0) {} diff --git a/modules/core/src/matrix_c.cpp b/modules/core/src/matrix_c.cpp index dc935c3eca..e15ea9fdac 100644 --- a/modules/core/src/matrix_c.cpp +++ b/modules/core/src/matrix_c.cpp @@ -1,7 +1,12 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "precomp.hpp" #include "opencv2/core/mat.hpp" #include "opencv2/core/types_c.h" -#include "precomp.hpp" +#ifndef OPENCV_EXCLUDE_C_API // glue CvMatND cvMatND(const cv::Mat& m) @@ -342,3 +347,5 @@ cvSort( const CvArr* _src, CvArr* _dst, CvArr* _idx, int flags ) CV_Assert( dst0.data == dst.data ); } } + +#endif // OPENCV_EXCLUDE_C_API diff --git a/modules/core/src/matrix_iterator.cpp b/modules/core/src/matrix_iterator.cpp index aaa7f4aa01..ce7c191cbe 100644 --- a/modules/core/src/matrix_iterator.cpp +++ b/modules/core/src/matrix_iterator.cpp @@ -2,9 +2,8 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html - -#include "opencv2/core/mat.hpp" #include "precomp.hpp" +#include "opencv2/core/mat.hpp" namespace cv { diff --git a/modules/core/src/matrix_operations.cpp b/modules/core/src/matrix_operations.cpp index ac94ecee7d..83c8aaeb57 100644 --- a/modules/core/src/matrix_operations.cpp +++ b/modules/core/src/matrix_operations.cpp @@ -2,11 +2,10 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html - +#include "precomp.hpp" #include "opencv2/core/mat.hpp" #include "opencv2/core/types_c.h" #include "opencl_kernels_core.hpp" -#include "precomp.hpp" #undef HAVE_IPP #undef CV_IPP_RUN_FAST @@ -227,6 +226,23 @@ void cv::setIdentity( InputOutputArray _m, const Scalar& s ) } } + +namespace cv { + +UMat UMat::eye(int rows, int cols, int type) +{ + return UMat::eye(Size(cols, rows), type); +} + +UMat UMat::eye(Size size, int type) +{ + UMat m(size, type); + setIdentity(m); + return m; +} + +} // namespace + //////////////////////////////////////////// trace /////////////////////////////////////////// 
cv::Scalar cv::trace( InputArray _m )
@@ -261,285 +277,6 @@ cv::Scalar cv::trace( InputArray _m )
     return cv::sum(m.diag());
 }
 
-////////////////////////////////////// transpose /////////////////////////////////////////
-
-namespace cv
-{
-
-template<typename T> static void
-transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
-{
-    int i=0, j, m = sz.width, n = sz.height;
-
-    #if CV_ENABLE_UNROLLED
-    for(; i <= m - 4; i += 4 )
-    {
-        T* d0 = (T*)(dst + dstep*i);
-        T* d1 = (T*)(dst + dstep*(i+1));
-        T* d2 = (T*)(dst + dstep*(i+2));
-        T* d3 = (T*)(dst + dstep*(i+3));
-
-        for( j = 0; j <= n - 4; j += 4 )
-        {
-            const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
-            const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
-            const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
-            const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
-
-            d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
-            d1[j] = s0[1]; d1[j+1] = s1[1]; d1[j+2] = s2[1]; d1[j+3] = s3[1];
-            d2[j] = s0[2]; d2[j+1] = s1[2]; d2[j+2] = s2[2]; d2[j+3] = s3[2];
-            d3[j] = s0[3]; d3[j+1] = s1[3]; d3[j+2] = s2[3]; d3[j+3] = s3[3];
-        }
-
-        for( ; j < n; j++ )
-        {
-            const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
-            d0[j] = s0[0]; d1[j] = s0[1]; d2[j] = s0[2]; d3[j] = s0[3];
-        }
-    }
-    #endif
-    for( ; i < m; i++ )
-    {
-        T* d0 = (T*)(dst + dstep*i);
-        j = 0;
-        #if CV_ENABLE_UNROLLED
-        for(; j <= n - 4; j += 4 )
-        {
-            const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
-            const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
-            const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
-            const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
-
-            d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
-        }
-        #endif
-        for( ; j < n; j++ )
-        {
-            const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
-            d0[j] = s0[0];
-        }
-    }
-}
-
-template<typename T> static void
-transposeI_( uchar* data, size_t step, int n )
-{
-    for( int i = 0; i < n; i++ )
-    {
-        T* row = (T*)(data + step*i);
-        uchar* data1 = data + i*sizeof(T);
-        for( int j = i+1; j < n; j++ )
-            std::swap( row[j], *(T*)(data1 + step*j) );
-    }
-}
-
-typedef void (*TransposeFunc)( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz );
-typedef void (*TransposeInplaceFunc)( uchar* data, size_t step, int n );
-
-#define DEF_TRANSPOSE_FUNC(suffix, type) \
-static void transpose_##suffix( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz ) \
-{ transpose_<type>(src, sstep, dst, dstep, sz); } \
-\
-static void transposeI_##suffix( uchar* data, size_t step, int n ) \
-{ transposeI_<type>(data, step, n); }
-
-DEF_TRANSPOSE_FUNC(8u, uchar)
-DEF_TRANSPOSE_FUNC(16u, ushort)
-DEF_TRANSPOSE_FUNC(8uC3, Vec3b)
-DEF_TRANSPOSE_FUNC(32s, int)
-DEF_TRANSPOSE_FUNC(16uC3, Vec3s)
-DEF_TRANSPOSE_FUNC(32sC2, Vec2i)
-DEF_TRANSPOSE_FUNC(32sC3, Vec3i)
-DEF_TRANSPOSE_FUNC(32sC4, Vec4i)
-DEF_TRANSPOSE_FUNC(32sC6, Vec6i)
-DEF_TRANSPOSE_FUNC(32sC8, Vec8i)
-
-static TransposeFunc transposeTab[] =
-{
-    0, transpose_8u, transpose_16u, transpose_8uC3, transpose_32s, 0, transpose_16uC3, 0,
-    transpose_32sC2, 0, 0, 0, transpose_32sC3, 0, 0, 0, transpose_32sC4,
-    0, 0, 0, 0, 0, 0, 0, transpose_32sC6, 0, 0, 0, 0, 0, 0, 0, transpose_32sC8
-};
-
-static TransposeInplaceFunc transposeInplaceTab[] =
-{
-    0, transposeI_8u, transposeI_16u, transposeI_8uC3, transposeI_32s, 0, transposeI_16uC3, 0,
-    transposeI_32sC2, 0, 0, 0, transposeI_32sC3, 0, 0, 0, transposeI_32sC4,
-    0, 0, 0, 0, 0, 0, 0, transposeI_32sC6, 0, 0, 0, 0, 0, 0, 0, transposeI_32sC8
-};
-
-#ifdef
HAVE_OPENCL - -static bool ocl_transpose( InputArray _src, OutputArray _dst ) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - const int TILE_DIM = 32, BLOCK_ROWS = 8; - int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type), - rowsPerWI = dev.isIntel() ? 4 : 1; - - UMat src = _src.getUMat(); - _dst.create(src.cols, src.rows, type); - UMat dst = _dst.getUMat(); - - String kernelName("transpose"); - bool inplace = dst.u == src.u; - - if (inplace) - { - CV_Assert(dst.cols == dst.rows); - kernelName += "_inplace"; - } - else - { - // check required local memory size - size_t required_local_memory = (size_t) TILE_DIM*(TILE_DIM+1)*CV_ELEM_SIZE(type); - if (required_local_memory > ocl::Device::getDefault().localMemSize()) - return false; - } - - ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc, - format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d -D rowsPerWI=%d%s", - ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), - cn, TILE_DIM, BLOCK_ROWS, rowsPerWI, inplace ? " -D INPLACE" : "")); - if (k.empty()) - return false; - - if (inplace) - k.args(ocl::KernelArg::ReadWriteNoSize(dst), dst.rows); - else - k.args(ocl::KernelArg::ReadOnly(src), - ocl::KernelArg::WriteOnlyNoSize(dst)); - - size_t localsize[2] = { TILE_DIM, BLOCK_ROWS }; - size_t globalsize[2] = { (size_t)src.cols, inplace ? ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI : (divUp((size_t)src.rows, TILE_DIM) * BLOCK_ROWS) }; - - if (inplace && dev.isIntel()) - { - localsize[0] = 16; - localsize[1] = dev.maxWorkGroupSize() / localsize[0]; - } - - return k.run(2, globalsize, localsize, false); -} - -#endif - -#ifdef HAVE_IPP -static bool ipp_transpose( Mat &src, Mat &dst ) -{ - CV_INSTRUMENT_REGION_IPP(); - - int type = src.type(); - typedef IppStatus (CV_STDCALL * IppiTranspose)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize roiSize); - typedef IppStatus (CV_STDCALL * IppiTransposeI)(const void * pSrcDst, int srcDstStep, IppiSize roiSize); - IppiTranspose ippiTranspose = 0; - IppiTransposeI ippiTranspose_I = 0; - - if (dst.data == src.data && dst.cols == dst.rows) - { - CV_SUPPRESS_DEPRECATED_START - ippiTranspose_I = - type == CV_8UC1 ? (IppiTransposeI)ippiTranspose_8u_C1IR : - type == CV_8UC3 ? (IppiTransposeI)ippiTranspose_8u_C3IR : - type == CV_8UC4 ? (IppiTransposeI)ippiTranspose_8u_C4IR : - type == CV_16UC1 ? (IppiTransposeI)ippiTranspose_16u_C1IR : - type == CV_16UC3 ? (IppiTransposeI)ippiTranspose_16u_C3IR : - type == CV_16UC4 ? (IppiTransposeI)ippiTranspose_16u_C4IR : - type == CV_16SC1 ? (IppiTransposeI)ippiTranspose_16s_C1IR : - type == CV_16SC3 ? (IppiTransposeI)ippiTranspose_16s_C3IR : - type == CV_16SC4 ? (IppiTransposeI)ippiTranspose_16s_C4IR : - type == CV_32SC1 ? (IppiTransposeI)ippiTranspose_32s_C1IR : - type == CV_32SC3 ? (IppiTransposeI)ippiTranspose_32s_C3IR : - type == CV_32SC4 ? (IppiTransposeI)ippiTranspose_32s_C4IR : - type == CV_32FC1 ? (IppiTransposeI)ippiTranspose_32f_C1IR : - type == CV_32FC3 ? (IppiTransposeI)ippiTranspose_32f_C3IR : - type == CV_32FC4 ? (IppiTransposeI)ippiTranspose_32f_C4IR : 0; - CV_SUPPRESS_DEPRECATED_END - } - else - { - ippiTranspose = - type == CV_8UC1 ? (IppiTranspose)ippiTranspose_8u_C1R : - type == CV_8UC3 ? (IppiTranspose)ippiTranspose_8u_C3R : - type == CV_8UC4 ? (IppiTranspose)ippiTranspose_8u_C4R : - type == CV_16UC1 ? (IppiTranspose)ippiTranspose_16u_C1R : - type == CV_16UC3 ? (IppiTranspose)ippiTranspose_16u_C3R : - type == CV_16UC4 ? (IppiTranspose)ippiTranspose_16u_C4R : - type == CV_16SC1 ? 
(IppiTranspose)ippiTranspose_16s_C1R : - type == CV_16SC3 ? (IppiTranspose)ippiTranspose_16s_C3R : - type == CV_16SC4 ? (IppiTranspose)ippiTranspose_16s_C4R : - type == CV_32SC1 ? (IppiTranspose)ippiTranspose_32s_C1R : - type == CV_32SC3 ? (IppiTranspose)ippiTranspose_32s_C3R : - type == CV_32SC4 ? (IppiTranspose)ippiTranspose_32s_C4R : - type == CV_32FC1 ? (IppiTranspose)ippiTranspose_32f_C1R : - type == CV_32FC3 ? (IppiTranspose)ippiTranspose_32f_C3R : - type == CV_32FC4 ? (IppiTranspose)ippiTranspose_32f_C4R : 0; - } - - IppiSize roiSize = { src.cols, src.rows }; - if (ippiTranspose != 0) - { - if (CV_INSTRUMENT_FUN_IPP(ippiTranspose, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, roiSize) >= 0) - return true; - } - else if (ippiTranspose_I != 0) - { - if (CV_INSTRUMENT_FUN_IPP(ippiTranspose_I, dst.ptr(), (int)dst.step, roiSize) >= 0) - return true; - } - return false; -} -#endif - -} - - -void cv::transpose( InputArray _src, OutputArray _dst ) -{ - CV_INSTRUMENT_REGION(); - - int type = _src.type(), esz = CV_ELEM_SIZE(type); - CV_Assert( _src.dims() <= 2 && esz <= 32 ); - - CV_OCL_RUN(_dst.isUMat(), - ocl_transpose(_src, _dst)) - - Mat src = _src.getMat(); - if( src.empty() ) - { - _dst.release(); - return; - } - - _dst.create(src.cols, src.rows, src.type()); - Mat dst = _dst.getMat(); - - // handle the case of single-column/single-row matrices, stored in STL vectors. - if( src.rows != dst.cols || src.cols != dst.rows ) - { - CV_Assert( src.size() == dst.size() && (src.cols == 1 || src.rows == 1) ); - src.copyTo(dst); - return; - } - - CV_IPP_RUN_FAST(ipp_transpose(src, dst)) - - if( dst.data == src.data ) - { - TransposeInplaceFunc func = transposeInplaceTab[esz]; - CV_Assert( func != 0 ); - CV_Assert( dst.cols == dst.rows ); - func( dst.ptr(), dst.step, dst.rows ); - } - else - { - TransposeFunc func = transposeTab[esz]; - CV_Assert( func != 0 ); - func( src.ptr(), src.step, dst.ptr(), dst.step, src.size() ); - } -} - ////////////////////////////////////// completeSymm ///////////////////////////////////////// diff --git a/modules/core/src/matrix_sparse.cpp b/modules/core/src/matrix_sparse.cpp index 05d16d706e..21e7e91151 100644 --- a/modules/core/src/matrix_sparse.cpp +++ b/modules/core/src/matrix_sparse.cpp @@ -2,10 +2,9 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html - +#include "precomp.hpp" #include "opencv2/core/mat.hpp" #include "opencv2/core/types_c.h" -#include "precomp.hpp" namespace cv { diff --git a/modules/core/src/matrix_transform.cpp b/modules/core/src/matrix_transform.cpp new file mode 100644 index 0000000000..727eaf7fee --- /dev/null +++ b/modules/core/src/matrix_transform.cpp @@ -0,0 +1,770 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "precomp.hpp"
+#include "opencl_kernels_core.hpp"
+
+namespace cv {
+
+////////////////////////////////////// transpose /////////////////////////////////////////
+
+template<typename T> static void
+transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
+{
+    int i=0, j, m = sz.width, n = sz.height;
+
+    #if CV_ENABLE_UNROLLED
+    for(; i <= m - 4; i += 4 )
+    {
+        T* d0 = (T*)(dst + dstep*i);
+        T* d1 = (T*)(dst + dstep*(i+1));
+        T* d2 = (T*)(dst + dstep*(i+2));
+        T* d3 = (T*)(dst + dstep*(i+3));
+
+        for( j = 0; j <= n - 4; j += 4 )
+        {
+            const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
+            const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
+            const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
+            const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
+
+            d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
+            d1[j] = s0[1]; d1[j+1] = s1[1]; d1[j+2] = s2[1]; d1[j+3] = s3[1];
+            d2[j] = s0[2]; d2[j+1] = s1[2]; d2[j+2] = s2[2]; d2[j+3] = s3[2];
+            d3[j] = s0[3]; d3[j+1] = s1[3]; d3[j+2] = s2[3]; d3[j+3] = s3[3];
+        }
+
+        for( ; j < n; j++ )
+        {
+            const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
+            d0[j] = s0[0]; d1[j] = s0[1]; d2[j] = s0[2]; d3[j] = s0[3];
+        }
+    }
+    #endif
+    for( ; i < m; i++ )
+    {
+        T* d0 = (T*)(dst + dstep*i);
+        j = 0;
+        #if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )
+        {
+            const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
+            const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
+            const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
+            const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
+
+            d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
+        }
+        #endif
+        for( ; j < n; j++ )
+        {
+            const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
+            d0[j] = s0[0];
+        }
+    }
+}
+
+template<typename T> static void
+transposeI_( uchar* data, size_t step, int n )
+{
+    for( int i = 0; i < n; i++ )
+    {
+        T* row = (T*)(data + step*i);
+        uchar* data1 = data + i*sizeof(T);
+        for( int j = i+1; j < n; j++ )
+            std::swap( row[j], *(T*)(data1 + step*j) );
+    }
+}
+
+typedef void (*TransposeFunc)( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz );
+typedef void (*TransposeInplaceFunc)( uchar* data, size_t step, int n );
+
+#define DEF_TRANSPOSE_FUNC(suffix, type) \
+static void transpose_##suffix( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz ) \
+{ transpose_<type>(src, sstep, dst, dstep, sz); } \
+\
+static void transposeI_##suffix( uchar* data, size_t step, int n ) \
+{ transposeI_<type>(data, step, n); }
+
+DEF_TRANSPOSE_FUNC(8u, uchar)
+DEF_TRANSPOSE_FUNC(16u, ushort)
+DEF_TRANSPOSE_FUNC(8uC3, Vec3b)
+DEF_TRANSPOSE_FUNC(32s, int)
+DEF_TRANSPOSE_FUNC(16uC3, Vec3s)
+DEF_TRANSPOSE_FUNC(32sC2, Vec2i)
+DEF_TRANSPOSE_FUNC(32sC3, Vec3i)
+DEF_TRANSPOSE_FUNC(32sC4, Vec4i)
+DEF_TRANSPOSE_FUNC(32sC6, Vec6i)
+DEF_TRANSPOSE_FUNC(32sC8, Vec8i)
+
+static TransposeFunc transposeTab[] =
+{
+    0, transpose_8u, transpose_16u, transpose_8uC3, transpose_32s, 0, transpose_16uC3, 0,
+    transpose_32sC2, 0, 0, 0, transpose_32sC3, 0, 0, 0, transpose_32sC4,
+    0, 0, 0,
0, 0, 0, 0, transposeI_32sC6, 0, 0, 0, 0, 0, 0, 0, transposeI_32sC8 +}; + +#ifdef HAVE_OPENCL + +static bool ocl_transpose( InputArray _src, OutputArray _dst ) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + const int TILE_DIM = 32, BLOCK_ROWS = 8; + int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type), + rowsPerWI = dev.isIntel() ? 4 : 1; + + UMat src = _src.getUMat(); + _dst.create(src.cols, src.rows, type); + UMat dst = _dst.getUMat(); + + String kernelName("transpose"); + bool inplace = dst.u == src.u; + + if (inplace) + { + CV_Assert(dst.cols == dst.rows); + kernelName += "_inplace"; + } + else + { + // check required local memory size + size_t required_local_memory = (size_t) TILE_DIM*(TILE_DIM+1)*CV_ELEM_SIZE(type); + if (required_local_memory > ocl::Device::getDefault().localMemSize()) + return false; + } + + ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc, + format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d -D rowsPerWI=%d%s", + ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), + cn, TILE_DIM, BLOCK_ROWS, rowsPerWI, inplace ? " -D INPLACE" : "")); + if (k.empty()) + return false; + + if (inplace) + k.args(ocl::KernelArg::ReadWriteNoSize(dst), dst.rows); + else + k.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::WriteOnlyNoSize(dst)); + + size_t localsize[2] = { TILE_DIM, BLOCK_ROWS }; + size_t globalsize[2] = { (size_t)src.cols, inplace ? ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI : (divUp((size_t)src.rows, TILE_DIM) * BLOCK_ROWS) }; + + if (inplace && dev.isIntel()) + { + localsize[0] = 16; + localsize[1] = dev.maxWorkGroupSize() / localsize[0]; + } + + return k.run(2, globalsize, localsize, false); +} + +#endif + +#ifdef HAVE_IPP +static bool ipp_transpose( Mat &src, Mat &dst ) +{ + CV_INSTRUMENT_REGION_IPP(); + + int type = src.type(); + typedef IppStatus (CV_STDCALL * IppiTranspose)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize roiSize); + typedef IppStatus (CV_STDCALL * IppiTransposeI)(const void * pSrcDst, int srcDstStep, IppiSize roiSize); + IppiTranspose ippiTranspose = 0; + IppiTransposeI ippiTranspose_I = 0; + + if (dst.data == src.data && dst.cols == dst.rows) + { + CV_SUPPRESS_DEPRECATED_START + ippiTranspose_I = + type == CV_8UC1 ? (IppiTransposeI)ippiTranspose_8u_C1IR : + type == CV_8UC3 ? (IppiTransposeI)ippiTranspose_8u_C3IR : + type == CV_8UC4 ? (IppiTransposeI)ippiTranspose_8u_C4IR : + type == CV_16UC1 ? (IppiTransposeI)ippiTranspose_16u_C1IR : + type == CV_16UC3 ? (IppiTransposeI)ippiTranspose_16u_C3IR : + type == CV_16UC4 ? (IppiTransposeI)ippiTranspose_16u_C4IR : + type == CV_16SC1 ? (IppiTransposeI)ippiTranspose_16s_C1IR : + type == CV_16SC3 ? (IppiTransposeI)ippiTranspose_16s_C3IR : + type == CV_16SC4 ? (IppiTransposeI)ippiTranspose_16s_C4IR : + type == CV_32SC1 ? (IppiTransposeI)ippiTranspose_32s_C1IR : + type == CV_32SC3 ? (IppiTransposeI)ippiTranspose_32s_C3IR : + type == CV_32SC4 ? (IppiTransposeI)ippiTranspose_32s_C4IR : + type == CV_32FC1 ? (IppiTransposeI)ippiTranspose_32f_C1IR : + type == CV_32FC3 ? (IppiTransposeI)ippiTranspose_32f_C3IR : + type == CV_32FC4 ? (IppiTransposeI)ippiTranspose_32f_C4IR : 0; + CV_SUPPRESS_DEPRECATED_END + } + else + { + ippiTranspose = + type == CV_8UC1 ? (IppiTranspose)ippiTranspose_8u_C1R : + type == CV_8UC3 ? (IppiTranspose)ippiTranspose_8u_C3R : + type == CV_8UC4 ? (IppiTranspose)ippiTranspose_8u_C4R : + type == CV_16UC1 ? (IppiTranspose)ippiTranspose_16u_C1R : + type == CV_16UC3 ? 
(IppiTranspose)ippiTranspose_16u_C3R :
+            type == CV_16UC4 ? (IppiTranspose)ippiTranspose_16u_C4R :
+            type == CV_16SC1 ? (IppiTranspose)ippiTranspose_16s_C1R :
+            type == CV_16SC3 ? (IppiTranspose)ippiTranspose_16s_C3R :
+            type == CV_16SC4 ? (IppiTranspose)ippiTranspose_16s_C4R :
+            type == CV_32SC1 ? (IppiTranspose)ippiTranspose_32s_C1R :
+            type == CV_32SC3 ? (IppiTranspose)ippiTranspose_32s_C3R :
+            type == CV_32SC4 ? (IppiTranspose)ippiTranspose_32s_C4R :
+            type == CV_32FC1 ? (IppiTranspose)ippiTranspose_32f_C1R :
+            type == CV_32FC3 ? (IppiTranspose)ippiTranspose_32f_C3R :
+            type == CV_32FC4 ? (IppiTranspose)ippiTranspose_32f_C4R : 0;
+    }
+
+    IppiSize roiSize = { src.cols, src.rows };
+    if (ippiTranspose != 0)
+    {
+        if (CV_INSTRUMENT_FUN_IPP(ippiTranspose, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, roiSize) >= 0)
+            return true;
+    }
+    else if (ippiTranspose_I != 0)
+    {
+        if (CV_INSTRUMENT_FUN_IPP(ippiTranspose_I, dst.ptr(), (int)dst.step, roiSize) >= 0)
+            return true;
+    }
+    return false;
+}
+#endif
+
+
+void transpose( InputArray _src, OutputArray _dst )
+{
+    CV_INSTRUMENT_REGION();
+
+    int type = _src.type(), esz = CV_ELEM_SIZE(type);
+    CV_Assert( _src.dims() <= 2 && esz <= 32 );
+
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_transpose(_src, _dst))
+
+    Mat src = _src.getMat();
+    if( src.empty() )
+    {
+        _dst.release();
+        return;
+    }
+
+    _dst.create(src.cols, src.rows, src.type());
+    Mat dst = _dst.getMat();
+
+    // handle the case of single-column/single-row matrices, stored in STL vectors.
+    if( src.rows != dst.cols || src.cols != dst.rows )
+    {
+        CV_Assert( src.size() == dst.size() && (src.cols == 1 || src.rows == 1) );
+        src.copyTo(dst);
+        return;
+    }
+
+    CV_IPP_RUN_FAST(ipp_transpose(src, dst))
+
+    if( dst.data == src.data )
+    {
+        TransposeInplaceFunc func = transposeInplaceTab[esz];
+        CV_Assert( func != 0 );
+        CV_Assert( dst.cols == dst.rows );
+        func( dst.ptr(), dst.step, dst.rows );
+    }
+    else
+    {
+        TransposeFunc func = transposeTab[esz];
+        CV_Assert( func != 0 );
+        func( src.ptr(), src.step, dst.ptr(), dst.step, src.size() );
+    }
+}
+
+
+#if CV_SIMD128
+template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    typedef typename V::lane_type T;
+    int end = (int)(size.width*esz);
+    int width = (end + 1)/2;
+    int width_1 = width & -v_uint8x16::nlanes;
+    int i, j;
+
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(T)>(src, dst));
+#endif
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+        {
+            V t0, t1;
+
+            t0 = v_load((T*)((uchar*)src + i));
+            t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));
+            t0 = v_reverse(t0);
+            t1 = v_reverse(t1);
+            v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
+            v_store((T*)(dst + i), t1);
+        }
+        if (isAligned<sizeof(T)>(src, dst))
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                T t0, t1;
+
+                t0 = *((T*)((uchar*)src + i));
+                t1 = *((T*)((uchar*)src + j - sizeof(T)));
+                *((T*)(dst + j - sizeof(T))) = t0;
+                *((T*)(dst + i)) = t1;
+            }
+        }
+        else
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                for (int k = 0; k < (int)sizeof(T); k++)
+                {
+                    uchar t0, t1;
+
+                    t0 = *((uchar*)src + i + k);
+                    t1 = *((uchar*)src + j + k - sizeof(T));
+                    *(dst + j + k - sizeof(T)) = t0;
+                    *(dst + i + k) = t1;
+                }
+            }
+        }
+    }
+}
+
+template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    int end = (int)(size.width*esz);
+    int width = (end + 1)/2;
+
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(T1)>(src, dst));
+    CV_Assert(isAligned<sizeof(T2)>(src, dst));
+#endif
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )
+        {
+            T1 t0, t1;
+            T2 t2, t3;
+
+            t0 = *((T1*)((uchar*)src + i));
+            t2 = *((T2*)((uchar*)src + i + sizeof(T1)));
+            t1 = *((T1*)((uchar*)src + j - sizeof(T1) - sizeof(T2)));
+            t3 = *((T2*)((uchar*)src + j - sizeof(T2)));
+            *((T1*)(dst + j - sizeof(T1) - sizeof(T2))) = t0;
+            *((T2*)(dst + j - sizeof(T2))) = t2;
+            *((T1*)(dst + i)) = t1;
+            *((T2*)(dst + i + sizeof(T1))) = t3;
+        }
+    }
+}
+#endif
+
+static void
+flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+#if CV_SIMD
+#if CV_STRONG_ALIGNMENT
+    size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
+#endif
+    if (esz == 2 * v_uint8x16::nlanes)
+    {
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
+            {
+#if CV_SIMD256
+                v_uint8x32 t0, t1;
+
+                t0 = v256_load((uchar*)src + i);
+                t1 = v256_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+#else
+                v_uint8x16 t0, t1, t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
+                t2 = v_load((uchar*)src + j);
+                t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
+                v_store(dst + j, t0);
+                v_store(dst + j + v_uint8x16::nlanes, t1);
+                v_store(dst + i, t2);
+                v_store(dst + i + v_uint8x16::nlanes, t3);
+#endif
+            }
+        }
+    }
+    else if (esz == v_uint8x16::nlanes)
+    {
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+            {
+                v_uint8x16 t0, t1;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+            }
+        }
+    }
+    else if (esz == 8
+#if CV_STRONG_ALIGNMENT
+            && isAligned<sizeof(uint64_t)>(alignmentMark)
+#endif
+            )
+    {
+        flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 4
+#if CV_STRONG_ALIGNMENT
+            && isAligned<sizeof(unsigned)>(alignmentMark)
+#endif
+            )
+    {
+        flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 2
+#if CV_STRONG_ALIGNMENT
+            && isAligned<sizeof(ushort)>(alignmentMark)
+#endif
+            )
+    {
+        flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 1)
+    {
+        flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 24
+#if CV_STRONG_ALIGNMENT
+            && isAligned<sizeof(uint64_t)>(alignmentMark)
+#endif
+            )
+    {
+        int end = (int)(size.width*esz);
+        int width = (end + 1)/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
+            {
+                v_uint8x16 t0, t1;
+                uint64_t t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
+                t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
+                t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
+                v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
+                *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
+                v_store(dst + i, t1);
+                *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
+            }
+        }
+    }
+#if !CV_STRONG_ALIGNMENT
+    else if (esz == 12)
+    {
+        flipHoriz_double<uint64_t, unsigned>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 6)
+    {
+        flipHoriz_double<unsigned, ushort>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 3)
+    {
+        flipHoriz_double<ushort, uchar>(src, sstep, dst, dstep, size, esz);
+    }
+#endif
+    else
+#endif // CV_SIMD
+    {
+        int i, j, limit = (int)(((size.width + 1)/2)*esz);
+        AutoBuffer<int> _tab(size.width*esz);
+        int* tab = _tab.data();
+
+        for( i = 0; i < size.width; i++ )
+            for( size_t k = 0; k < esz; k++ )
+                tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( i = 0; i < limit; i++ )
+            {
+                j = tab[i];
+                uchar t0 = src[i], t1 = src[j];
+                dst[i] = t1; dst[j] = t0;
+            }
+        }
+    }
+}
+
+static void
+flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, size_t esz )
+{
+    const uchar* src1 = src0 + (size.height - 1)*sstep;
+    uchar* dst1 = dst0 + (size.height - 1)*dstep;
+    size.width *= (int)esz;
+
+    for( int y = 0; y < (size.height + 1)/2; y++, src0 += sstep, src1 -= sstep,
+                                                  dst0 += dstep, dst1 -= dstep )
+    {
+        int i = 0;
+#if CV_SIMD
+#if CV_STRONG_ALIGNMENT
+        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
+#endif
+        {
+            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            {
+                v_int32 t0 = vx_load((int*)(src0 + i));
+                v_int32 t1 = vx_load((int*)(src1 + i));
+                v_store((int*)(dst0 + i), t1);
+                v_store((int*)(dst1 + i), t0);
+            }
+        }
+#if CV_STRONG_ALIGNMENT
+        else
+        {
+            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            {
+                v_uint8 t0 = vx_load(src0 + i);
+                v_uint8 t1 = vx_load(src1 + i);
+                v_store(dst0 + i, t1);
+                v_store(dst1 + i, t0);
+            }
+        }
+#endif
+#endif
+
+        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
+        {
+            for( ; i <= size.width - 16; i += 16 )
+            {
+                int t0 = ((int*)(src0 + i))[0];
+                int t1 = ((int*)(src1 + i))[0];
+
+                ((int*)(dst0 + i))[0] = t1;
+                ((int*)(dst1 + i))[0] = t0;
+
+                t0 = ((int*)(src0 + i))[1];
+                t1 = ((int*)(src1 + i))[1];
+
+                ((int*)(dst0 + i))[1] = t1;
+                ((int*)(dst1 + i))[1] = t0;
+
+                t0 = ((int*)(src0 + i))[2];
+                t1 = ((int*)(src1 + i))[2];
+
+                ((int*)(dst0 + i))[2] = t1;
+                ((int*)(dst1 + i))[2] = t0;
+
+                t0 = ((int*)(src0 + i))[3];
+                t1 = ((int*)(src1 + i))[3];
+
+                ((int*)(dst0 + i))[3] = t1;
+                ((int*)(dst1 + i))[3] = t0;
+            }
+
+            for( ; i <= size.width - 4; i += 4 )
+            {
+                int t0 = ((int*)(src0 + i))[0];
+                int t1 = ((int*)(src1 + i))[0];
+
+                ((int*)(dst0 + i))[0] = t1;
+                ((int*)(dst1 + i))[0] = t0;
+            }
+        }
+
+        for( ; i < size.width; i++ )
+        {
+            uchar t0 = src0[i];
+            uchar t1 = src1[i];
+
+            dst0[i] = t1;
+            dst1[i] = t0;
+        }
+    }
+}
+
+#ifdef HAVE_OPENCL
+
+enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
+
+static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
+{
+    CV_Assert(flipCode >= -1 && flipCode <= 1);
+
+    const ocl::Device & dev = ocl::Device::getDefault();
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+            flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);
+
+    bool doubleSupport = dev.doubleFPConfig() > 0;
+    if (!doubleSupport && depth == CV_64F)
+        kercn = cn;
+
+    if (cn > 4)
+        return false;
+
+    const char * kernelName;
+    if (flipCode == 0)
+        kernelName = "arithm_flip_rows", flipType = FLIP_ROWS;
+    else if (flipCode > 0)
+        kernelName = "arithm_flip_cols", flipType = FLIP_COLS;
+    else
+        kernelName = "arithm_flip_rows_cols", flipType = FLIP_BOTH;
+
+    int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
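+    // kercn is the number of elements each work-item handles. Column flips of
+    // 3-channel images must move whole pixels, so the granularity stays at cn;
+    // in every other case it is widened to at least one full pixel.
+    kercn = (cn!=3 || flipType == FLIP_ROWS) ?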
std::max(kercn, cn) : cn; + + ocl::Kernel k(kernelName, ocl::core::flip_oclsrc, + format( "-D T=%s -D T1=%s -D DEPTH=%d -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d", + kercn != cn ? ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)) : ocl::vecopTypeToStr(CV_MAKE_TYPE(depth, kercn)), + kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), depth, cn, pxPerWIy, kercn)); + if (k.empty()) + return false; + + Size size = _src.size(); + _dst.create(size, type); + UMat src = _src.getUMat(), dst = _dst.getUMat(); + + int cols = size.width * cn / kercn, rows = size.height; + cols = flipType == FLIP_COLS ? (cols + 1) >> 1 : cols; + rows = flipType & FLIP_ROWS ? (rows + 1) >> 1 : rows; + + k.args(ocl::KernelArg::ReadOnlyNoSize(src), + ocl::KernelArg::WriteOnly(dst, cn, kercn), rows, cols); + + size_t maxWorkGroupSize = dev.maxWorkGroupSize(); + CV_Assert(maxWorkGroupSize % 4 == 0); + + size_t globalsize[2] = { (size_t)cols, ((size_t)rows + pxPerWIy - 1) / pxPerWIy }, + localsize[2] = { maxWorkGroupSize / 4, 4 }; + return k.run(2, globalsize, (flipType == FLIP_COLS) && !dev.isIntel() ? localsize : NULL, false); +} + +#endif + +#if defined HAVE_IPP +static bool ipp_flip(Mat &src, Mat &dst, int flip_mode) +{ +#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP(); + + // Details: https://github.com/opencv/opencv/issues/12943 + if (flip_mode <= 0 /* swap rows */ + && cv::ipp::getIppTopFeatures() != ippCPUID_SSE42 + && (int64_t)(src.total()) * src.elemSize() >= CV_BIG_INT(0x80000000)/*2Gb*/ + ) + return false; + + IppiAxis ippMode; + if(flip_mode < 0) + ippMode = ippAxsBoth; + else if(flip_mode == 0) + ippMode = ippAxsHorizontal; + else + ippMode = ippAxsVertical; + + try + { + ::ipp::IwiImage iwSrc = ippiGetImage(src); + ::ipp::IwiImage iwDst = ippiGetImage(dst); + + CV_INSTRUMENT_FUN_IPP(::ipp::iwiMirror, iwSrc, iwDst, ippMode); + } + catch(const ::ipp::IwException &) + { + return false; + } + + return true; +#else + CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(flip_mode); + return false; +#endif +} +#endif + + +void flip( InputArray _src, OutputArray _dst, int flip_mode ) +{ + CV_INSTRUMENT_REGION(); + + CV_Assert( _src.dims() <= 2 ); + Size size = _src.size(); + + if (flip_mode < 0) + { + if (size.width == 1) + flip_mode = 0; + if (size.height == 1) + flip_mode = 1; + } + + if ((size.width == 1 && flip_mode > 0) || + (size.height == 1 && flip_mode == 0)) + { + return _src.copyTo(_dst); + } + + CV_OCL_RUN( _dst.isUMat(), ocl_flip(_src, _dst, flip_mode)) + + Mat src = _src.getMat(); + int type = src.type(); + _dst.create( size, type ); + Mat dst = _dst.getMat(); + + CV_IPP_RUN_FAST(ipp_flip(src, dst, flip_mode)); + + size_t esz = CV_ELEM_SIZE(type); + + if( flip_mode <= 0 ) + flipVert( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz ); + else + flipHoriz( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz ); + + if( flip_mode < 0 ) + flipHoriz( dst.ptr(), dst.step, dst.ptr(), dst.step, dst.size(), esz ); +} + +void rotate(InputArray _src, OutputArray _dst, int rotateMode) +{ + CV_Assert(_src.dims() <= 2); + + switch (rotateMode) + { + case ROTATE_90_CLOCKWISE: + transpose(_src, _dst); + flip(_dst, _dst, 1); + break; + case ROTATE_180: + flip(_src, _dst, -1); + break; + case ROTATE_90_COUNTERCLOCKWISE: + transpose(_src, _dst); + flip(_dst, _dst, 0); + break; + default: + break; + } +} + +} // namespace diff --git a/modules/core/src/matrix_wrap.cpp b/modules/core/src/matrix_wrap.cpp index 68a674f6f1..bb61ce2de1 100644 --- a/modules/core/src/matrix_wrap.cpp +++ b/modules/core/src/matrix_wrap.cpp @@ 
-2,9 +2,8 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
 
-
-#include "opencv2/core/mat.hpp"
 #include "precomp.hpp"
+#include "opencv2/core/mat.hpp"
 
 namespace cv {
 
@@ -33,7 +32,7 @@ Mat _InputArray::getMat_(int i) const
         return m->getMat(accessFlags).row(i);
     }
 
-    if( k == MATX || k == STD_ARRAY )
+    if (k == MATX)
     {
         CV_Assert( i < 0 );
         return Mat(sz, flags, obj);
@@ -173,7 +172,7 @@ void _InputArray::getMatVector(std::vector<Mat>& mv) const
         return;
     }
 
-    if( k == MATX || k == STD_ARRAY )
+    if (k == MATX)
    {
         size_t n = sz.height, esz = CV_ELEM_SIZE(flags);
         mv.resize(n);
@@ -317,6 +316,7 @@ void _InputArray::getUMatVector(std::vector<UMat>& umv) const
 
 cuda::GpuMat _InputArray::getGpuMat() const
 {
+#ifdef HAVE_CUDA
     _InputArray::KindFlag k = kind();
 
     if (k == CUDA_GPU_MAT)
@@ -340,14 +340,22 @@ cuda::GpuMat _InputArray::getGpuMat() const
         return cuda::GpuMat();
 
     CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for cuda::GpuMat and cuda::HostMem");
+#else
+    CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
 }
 void _InputArray::getGpuMatVector(std::vector<cuda::GpuMat>& gpumv) const
 {
+#ifdef HAVE_CUDA
     _InputArray::KindFlag k = kind();
     if (k == STD_VECTOR_CUDA_GPU_MAT)
     {
         gpumv = *(std::vector<cuda::GpuMat>*)obj;
     }
+#else
+    CV_UNUSED(gpumv);
+    CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
 }
 ogl::Buffer _InputArray::getOGlBuffer() const
 {
@@ -362,7 +370,10 @@ ogl::Buffer _InputArray::getOGlBuffer() const
 _InputArray::KindFlag _InputArray::kind() const
 {
     KindFlag k = flags & KIND_MASK;
+#if CV_VERSION_MAJOR < 5
     CV_DbgAssert(k != EXPR);
+    CV_DbgAssert(k != STD_ARRAY);
+#endif
     return k;
 }
 
@@ -392,7 +403,7 @@ Size _InputArray::size(int i) const
         return ((const UMat*)obj)->size();
     }
 
-    if( k == MATX || k == STD_ARRAY )
+    if (k == MATX)
    {
         CV_Assert( i < 0 );
         return sz;
@@ -451,11 +462,15 @@ Size _InputArray::size(int i) const
 
     if (k == STD_VECTOR_CUDA_GPU_MAT)
     {
+#ifdef HAVE_CUDA
         const std::vector<cuda::GpuMat>& vv = *(const std::vector<cuda::GpuMat>*)obj;
         if (i < 0)
             return vv.empty() ? Size() : Size((int)vv.size(), 1);
         CV_Assert(i < (int)vv.size());
         return vv[i].size();
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
 
     if( k == STD_VECTOR_UMAT )
@@ -612,7 +627,7 @@ int _InputArray::dims(int i) const
         return ((const UMat*)obj)->dims;
     }
 
-    if( k == MATX || k == STD_ARRAY )
+    if (k == MATX)
    {
         CV_Assert( i < 0 );
         return 2;
@@ -746,7 +761,7 @@ int _InputArray::type(int i) const
     if( k == UMAT )
         return ((const UMat*)obj)->type();
 
-    if( k == MATX || k == STD_VECTOR || k == STD_ARRAY || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR )
+    if( k == MATX || k == STD_VECTOR || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR )
         return CV_MAT_TYPE(flags);
 
     if( k == NONE )
@@ -790,6 +805,7 @@ int _InputArray::type(int i) const
 
     if (k == STD_VECTOR_CUDA_GPU_MAT)
     {
+#ifdef HAVE_CUDA
         const std::vector<cuda::GpuMat>& vv = *(const std::vector<cuda::GpuMat>*)obj;
         if (vv.empty())
         {
@@ -798,6 +814,9 @@ int _InputArray::type(int i) const
         }
         CV_Assert(i < (int)vv.size());
         return vv[i >= 0 ?
i : 0].type(); +#else + CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)"); +#endif } if( k == OPENGL_BUFFER ) @@ -832,7 +851,7 @@ bool _InputArray::empty() const if( k == UMAT ) return ((const UMat*)obj)->empty(); - if( k == MATX || k == STD_ARRAY ) + if (k == MATX) return false; if( k == STD_VECTOR ) @@ -901,7 +920,7 @@ bool _InputArray::isContinuous(int i) const if( k == UMAT ) return i < 0 ? ((const UMat*)obj)->isContinuous() : true; - if( k == MATX || k == STD_VECTOR || k == STD_ARRAY || + if( k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return true; @@ -942,7 +961,7 @@ bool _InputArray::isSubmatrix(int i) const if( k == UMAT ) return i < 0 ? ((const UMat*)obj)->isSubmatrix() : false; - if( k == MATX || k == STD_VECTOR || k == STD_ARRAY || + if( k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return false; @@ -987,7 +1006,7 @@ size_t _InputArray::offset(int i) const return ((const UMat*)obj)->offset; } - if( k == MATX || k == STD_VECTOR || k == STD_ARRAY || + if( k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return 0; @@ -1046,7 +1065,7 @@ size_t _InputArray::step(int i) const return ((const UMat*)obj)->step; } - if( k == MATX || k == STD_VECTOR || k == STD_ARRAY || + if( k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return 0; @@ -1092,7 +1111,7 @@ void _InputArray::copyTo(const _OutputArray& arr) const if( k == NONE ) arr.release(); - else if( k == MAT || k == MATX || k == STD_VECTOR || k == STD_ARRAY || k == STD_BOOL_VECTOR ) + else if( k == MAT || k == MATX || k == STD_VECTOR || k == STD_BOOL_VECTOR ) { Mat m = getMat(); m.copyTo(arr); @@ -1113,7 +1132,7 @@ void _InputArray::copyTo(const _OutputArray& arr, const _InputArray & mask) cons if( k == NONE ) arr.release(); - else if( k == MAT || k == MATX || k == STD_VECTOR || k == STD_ARRAY || k == STD_BOOL_VECTOR ) + else if( k == MAT || k == MATX || k == STD_VECTOR || k == STD_BOOL_VECTOR ) { Mat m = getMat(); m.copyTo(arr, mask); @@ -1159,22 +1178,34 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, _Out { CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == _sz); CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype); +#ifdef HAVE_CUDA ((cuda::GpuMat*)obj)->create(_sz, mtype); return; +#else + CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)"); +#endif } if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 ) { CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == _sz); CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype); +#ifdef HAVE_OPENGL ((ogl::Buffer*)obj)->create(_sz, mtype); return; +#else + CV_Error(Error::StsNotImplemented, "OpenGL support is not enabled in this OpenCV build (missing HAVE_OPENGL)"); +#endif } if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 ) { CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == _sz); CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype); +#ifdef HAVE_CUDA ((cuda::HostMem*)obj)->create(_sz, mtype); return; +#else + CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)"); +#endif } int sizes[] = {_sz.height, _sz.width}; create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); @@ -1201,22 +1232,34 @@ void 
_OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTran { CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows)); CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype); +#ifdef HAVE_CUDA ((cuda::GpuMat*)obj)->create(_rows, _cols, mtype); return; +#else + CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)"); +#endif } if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 ) { CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(_cols, _rows)); CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype); +#ifdef HAVE_OPENGL ((ogl::Buffer*)obj)->create(_rows, _cols, mtype); return; +#else + CV_Error(Error::StsNotImplemented, "OpenGL support is not enabled in this OpenCV build (missing HAVE_OPENGL)"); +#endif } if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 ) { CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == Size(_cols, _rows)); CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype); +#ifdef HAVE_CUDA ((cuda::HostMem*)obj)->create(_rows, _cols, mtype); return; +#else + CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)"); +#endif } int sizes[] = {_rows, _cols}; create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); @@ -1301,16 +1344,27 @@ void _OutputArray::create(int d, const int* sizes, int mtype, int i, CV_Assert( i < 0 ); int type0 = CV_MAT_TYPE(flags); CV_Assert( mtype == type0 || (CV_MAT_CN(mtype) == 1 && ((1 << type0) & fixedDepthMask) != 0) ); - CV_Assert( d == 2 && ((sizes[0] == sz.height && sizes[1] == sz.width) || - (allowTransposed && sizes[0] == sz.width && sizes[1] == sz.height))); - return; - } - - if( k == STD_ARRAY ) - { - int type0 = CV_MAT_TYPE(flags); - CV_Assert( mtype == type0 || (CV_MAT_CN(mtype) == 1 && ((1 << type0) & fixedDepthMask) != 0) ); - CV_Assert( d == 2 && sz.area() == sizes[0]*sizes[1]); + CV_CheckLE(d, 2, ""); + Size requested_size(d == 2 ? sizes[1] : 1, d >= 1 ? 
sizes[0] : 1);
+        if (sz.width == 1 || sz.height == 1)
+        {
+            // NB: 1D arrays assume allowTransposed=true (see #4159)
+            int total_1d = std::max(sz.width, sz.height);
+            CV_Check(requested_size, std::max(requested_size.width, requested_size.height) == total_1d, "");
+        }
+        else
+        {
+            if (!allowTransposed)
+            {
+                CV_CheckEQ(requested_size, sz, "");
+            }
+            else
+            {
+                CV_Check(requested_size,
+                        (requested_size == sz || (requested_size.height == sz.width && requested_size.width == sz.height)),
+                        "");
+            }
+        }
         return;
     }
 
@@ -1628,20 +1682,32 @@ void _OutputArray::release() const
 
     if( k == CUDA_GPU_MAT )
     {
+#ifdef HAVE_CUDA
         ((cuda::GpuMat*)obj)->release();
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
 
     if( k == CUDA_HOST_MEM )
     {
+#ifdef HAVE_CUDA
         ((cuda::HostMem*)obj)->release();
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
 
     if( k == OPENGL_BUFFER )
     {
+#ifdef HAVE_OPENGL
         ((ogl::Buffer*)obj)->release();
         return;
+#else
        CV_Error(Error::StsNotImplemented, "OpenGL support is not enabled in this OpenCV build (missing HAVE_OPENGL)");
+#endif
     }
 
     if( k == NONE )
@@ -1672,8 +1738,12 @@ void _OutputArray::release() const
     }
     if (k == STD_VECTOR_CUDA_GPU_MAT)
     {
+#ifdef HAVE_CUDA
         ((std::vector<cuda::GpuMat>*)obj)->clear();
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
     CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
 }
@@ -1772,7 +1842,7 @@ void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
 
     if( k == NONE )
         ;
-    else if( k == MAT || k == MATX || k == STD_VECTOR || k == STD_ARRAY )
+    else if (k == MAT || k == MATX || k == STD_VECTOR)
     {
         Mat m = getMat();
         m.setTo(arr, mask);
@@ -1781,9 +1851,13 @@ void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
         ((UMat*)obj)->setTo(arr, mask);
     else if( k == CUDA_GPU_MAT )
     {
+#ifdef HAVE_CUDA
         Mat value = arr.getMat();
         CV_Assert( checkScalar(value, type(), arr.kind(), _InputArray::CUDA_GPU_MAT) );
         ((cuda::GpuMat*)obj)->setTo(Scalar(Vec<double, 4>(value.ptr<double>())), mask);
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
     else
         CV_Error(Error::StsNotImplemented, "");
diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp
index 088c163c87..bbefefc95d 100644
--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@@ -152,10 +152,10 @@ float normL2Sqr_(const float* a, const float* b, int n)
     {
         v_float32 t0 = vx_load(a + j) - vx_load(b + j);
         v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
-        v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
-        v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
         v_d0 = v_muladd(t0, t0, v_d0);
+        v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
         v_d1 = v_muladd(t1, t1, v_d1);
+        v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
         v_d2 = v_muladd(t2, t2, v_d2);
         v_d3 = v_muladd(t3, t3, v_d3);
     }
@@ -205,13 +205,10 @@ int normL1_(const uchar* a, const uchar* b, int n)
     return d;
 }
 
-}} //cv::hal
+} //cv::hal
 
 //==================================================================================================
 
-namespace cv
-{
-
 template<typename T, typename ST> int normInf_(const
T* src, const uchar* mask, ST* _result, int len, int cn) { @@ -594,12 +591,10 @@ static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result) CV_UNUSED(src); CV_UNUSED(normType); CV_UNUSED(mask); CV_UNUSED(result); #endif return false; -} -#endif +} // ipp_norm() +#endif // HAVE_IPP -} // cv:: - -double cv::norm( InputArray _src, int normType, InputArray _mask ) +double norm( InputArray _src, int normType, InputArray _mask ) { CV_INSTRUMENT_REGION(); @@ -792,9 +787,6 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) //================================================================================================== #ifdef HAVE_OPENCL - -namespace cv { - static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result ) { #ifdef __ANDROID__ @@ -849,15 +841,10 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr result /= (s2 + DBL_EPSILON); return true; -} - -} - -#endif +} // ocl_norm() +#endif // HAVE_OPENCL #ifdef HAVE_IPP -namespace cv -{ static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask, double &result) { CV_INSTRUMENT_REGION_IPP(); @@ -1083,12 +1070,11 @@ static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArra CV_UNUSED(_src1); CV_UNUSED(_src2); CV_UNUSED(normType); CV_UNUSED(_mask); CV_UNUSED(result); #endif return false; -} -} -#endif +} // ipp_norm +#endif // HAVE_IPP -double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask ) +double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask ) { CV_INSTRUMENT_REGION(); @@ -1280,12 +1266,12 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m return result.d; } -cv::Hamming::ResultType cv::Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const +cv::Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const { return cv::hal::normHamming(a, b, size); } -double cv::PSNR(InputArray _src1, InputArray _src2, double R) +double PSNR(InputArray _src1, InputArray _src2, double R) { CV_INSTRUMENT_REGION(); @@ -1295,3 +1281,141 @@ double cv::PSNR(InputArray _src1, InputArray _src2, double R) double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels())); return 20*log10(R/(diff+DBL_EPSILON)); } + + +#ifdef HAVE_OPENCL +static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype, + double scale, double delta ) +{ + UMat src = _src.getUMat(); + + if( _mask.empty() ) + src.convertTo( _dst, dtype, scale, delta ); + else if (src.channels() <= 4) + { + const ocl::Device & dev = ocl::Device::getDefault(); + + int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), + ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)), + rowsPerWI = dev.isIntel() ? 
4 : 1; + + float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta); + bool haveScale = std::fabs(scale - 1) > DBL_EPSILON, + haveZeroScale = !(std::fabs(scale) > DBL_EPSILON), + haveDelta = std::fabs(delta) > DBL_EPSILON, + doubleSupport = dev.doubleFPConfig() > 0; + + if (!haveScale && !haveDelta && stype == dtype) + { + _src.copyTo(_dst, _mask); + return true; + } + if (haveZeroScale) + { + _dst.setTo(Scalar(delta), _mask); + return true; + } + + if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport) + return false; + + char cvt[2][40]; + String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d" + " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s", + ocl::typeToStr(stype), ocl::typeToStr(dtype), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn, + rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), + doubleSupport ? " -D DOUBLE_SUPPORT" : "", + haveScale ? " -D HAVE_SCALE" : "", + haveDelta ? " -D HAVE_DELTA" : "", + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth)); + + ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts); + if (k.empty()) + return false; + + UMat mask = _mask.getUMat(), dst = _dst.getUMat(); + + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), + maskarg = ocl::KernelArg::ReadOnlyNoSize(mask), + dstarg = ocl::KernelArg::ReadWrite(dst); + + if (haveScale) + { + if (haveDelta) + k.args(srcarg, maskarg, dstarg, fscale, fdelta); + else + k.args(srcarg, maskarg, dstarg, fscale); + } + else + { + if (haveDelta) + k.args(srcarg, maskarg, dstarg, fdelta); + else + k.args(srcarg, maskarg, dstarg); + } + + size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; + return k.run(2, globalsize, NULL, false); + } + else + { + UMat temp; + src.convertTo( temp, dtype, scale, delta ); + temp.copyTo( _dst, _mask ); + } + + return true; +} // ocl_normalize +#endif // HAVE_OPENCL + +void normalize(InputArray _src, InputOutputArray _dst, double a, double b, + int norm_type, int rtype, InputArray _mask) +{ + CV_INSTRUMENT_REGION(); + + double scale = 1, shift = 0; + int type = _src.type(), depth = CV_MAT_DEPTH(type); + + if( rtype < 0 ) + rtype = _dst.fixedType() ? _dst.depth() : depth; + + if( norm_type == CV_MINMAX ) + { + double smin = 0, smax = 0; + double dmin = MIN( a, b ), dmax = MAX( a, b ); + minMaxIdx( _src, &smin, &smax, 0, 0, _mask ); + scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0); + if( rtype == CV_32F ) + { + scale = (float)scale; + shift = (float)dmin - (float)(smin*scale); + } + else + shift = dmin - smin*scale; + } + else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C ) + { + scale = norm( _src, norm_type, _mask ); + scale = scale > DBL_EPSILON ?
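// [editor's note] ocl_normalize() above reduces the affine map dst = src*scale + delta to
// cheaper operations where possible; a host-side equivalent of those fast paths
// (hypothetical helper, for illustration only):
static void normalizeAffine(const cv::Mat& src, cv::Mat& dst, int dtype,
                            double scale, double delta, const cv::Mat& mask)
{
    if (std::fabs(scale - 1) <= DBL_EPSILON && std::fabs(delta) <= DBL_EPSILON && src.type() == dtype)
        src.copyTo(dst, mask);               // identity: masked copy, no arithmetic
    else if (!(std::fabs(scale) > DBL_EPSILON))
        dst.setTo(cv::Scalar(delta), mask);  // zero scale: constant fill
    else
    {
        cv::Mat tmp;
        src.convertTo(tmp, dtype, scale, delta); // general case: convert, then masked copy
        tmp.copyTo(dst, mask);
    }
}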
a/scale : 0.; + shift = 0; + } + else + CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" ); + + CV_OCL_RUN(_dst.isUMat(), + ocl_normalize(_src, _dst, _mask, rtype, scale, shift)) + + Mat src = _src.getMat(); + if( _mask.empty() ) + src.convertTo( _dst, rtype, scale, shift ); + else + { + Mat temp; + src.convertTo( temp, rtype, scale, shift ); + temp.copyTo( _dst, _mask ); + } +} + +} // namespace diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 44ee8f9c59..ac52eeaf99 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -1149,14 +1149,14 @@ void OpenCLExecutionContext::release() } + // true if we have initialized OpenCL subsystem with available platforms -static bool g_isOpenCVActivated = false; +static bool g_isOpenCLInitialized = false; +static bool g_isOpenCLAvailable = false; bool haveOpenCL() { CV_TRACE_FUNCTION(); - static bool g_isOpenCLInitialized = false; - static bool g_isOpenCLAvailable = false; if (!g_isOpenCLInitialized) { @@ -1178,7 +1178,7 @@ bool haveOpenCL() { cl_uint n = 0; g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS; - g_isOpenCVActivated = n > 0; + g_isOpenCLAvailable &= n > 0; CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms"); } catch (...) @@ -1214,7 +1214,7 @@ bool useOpenCL() bool isOpenCLActivated() { - if (!g_isOpenCVActivated) + if (!g_isOpenCLAvailable) return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls return useOpenCL(); } @@ -1451,7 +1451,7 @@ struct Platform::Impl bool initialized; }; -Platform::Platform() +Platform::Platform() CV_NOEXCEPT { p = 0; } @@ -1480,6 +1480,23 @@ Platform& Platform::operator = (const Platform& pl) return *this; } +Platform::Platform(Platform&& pl) CV_NOEXCEPT +{ + p = pl.p; + pl.p = nullptr; +} + +Platform& Platform::operator = (Platform&& pl) CV_NOEXCEPT +{ + if (this != &pl) { + if(p) + p->release(); + p = pl.p; + pl.p = nullptr; + } + return *this; +} + void* Platform::ptr() const { return p ? 
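// [editor's note] Worked example for the NORM_MINMAX branch above: mapping a source range
// [smin, smax] = [10, 60] onto [dmin, dmax] = [0, 255] gives
//     scale = (255 - 0) / (60 - 10) = 5.1
//     shift = dmin - smin*scale = 0 - 10*5.1 = -51
// so dst = src*5.1 - 51 sends 10 -> 0 and 60 -> 255; scale degenerates to 0 when
// smax - smin <= DBL_EPSILON, which turns the transform into a constant fill.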
p->handle : 0; @@ -1499,25 +1516,27 @@ Platform& Platform::getDefault() /////////////////////////////////////// Device //////////////////////////////////////////// -// deviceVersion has format +// Version has format: // OpenCL // by specification // http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html // http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html -static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor) +// https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetPlatformInfo.html +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetPlatformInfo.html +static void parseOpenCLVersion(const String &version, int &major, int &minor) { major = minor = 0; - if (10 >= deviceVersion.length()) + if (10 >= version.length()) return; - const char *pstr = deviceVersion.c_str(); + const char *pstr = version.c_str(); if (0 != strncmp(pstr, "OpenCL ", 7)) return; - size_t ppos = deviceVersion.find('.', 7); + size_t ppos = version.find('.', 7); if (String::npos == ppos) return; - String temp = deviceVersion.substr(7, ppos - 7); + String temp = version.substr(7, ppos - 7); major = atoi(temp.c_str()); - temp = deviceVersion.substr(ppos + 1); + temp = version.substr(ppos + 1); minor = atoi(temp.c_str()); } @@ -1555,7 +1574,7 @@ struct Device::Impl addressBits_ = getProp(CL_DEVICE_ADDRESS_BITS); String deviceVersion_ = getStrProp(CL_DEVICE_VERSION); - parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_); + parseOpenCLVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_); size_t pos = 0; while (pos < extensions_.size()) @@ -1675,7 +1694,7 @@ struct Device::Impl }; -Device::Device() +Device::Device() CV_NOEXCEPT { p = 0; } @@ -1704,6 +1723,23 @@ Device& Device::operator = (const Device& d) return *this; } +Device::Device(Device&& d) CV_NOEXCEPT +{ + p = d.p; + d.p = nullptr; +} + +Device& Device::operator = (Device&& d) CV_NOEXCEPT +{ + if (this != &d) { + if(p) + p->release(); + p = d.p; + d.p = nullptr; + } + return *this; +} + Device::~Device() { if(p) @@ -2832,7 +2868,7 @@ public: }; -Context::Context() +Context::Context() CV_NOEXCEPT { p = 0; } @@ -2917,6 +2953,23 @@ Context& Context::operator = (const Context& c) return *this; } +Context::Context(Context&& c) CV_NOEXCEPT +{ + p = c.p; + c.p = nullptr; +} + +Context& Context::operator = (Context&& c) CV_NOEXCEPT +{ + if (this != &c) { + if(p) + p->release(); + p = c.p; + c.p = nullptr; + } + return *this; +} + void* Context::ptr() const { return p == NULL ? 
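// [editor's note] The same noexcept move pattern is stamped out in this patch for every
// ref-counted ocl wrapper (Platform, Device, Context, Queue, Kernel, Program, ProgramSource,
// PlatformInfo, Image2D): steal the impl pointer, null the source, and release the old impl on
// move assignment, so no refcount traffic occurs. Generic shape (T stands for any of them):
//     T::T(T&& other) CV_NOEXCEPT { p = other.p; other.p = nullptr; }
//     T& T::operator=(T&& other) CV_NOEXCEPT
//     {
//         if (this != &other) { if (p) p->release(); p = other.p; other.p = nullptr; }
//         return *this;
//     }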
NULL : p->handle; @@ -3229,7 +3282,7 @@ struct Queue::Impl cv::ocl::Queue profiling_queue_; }; -Queue::Queue() +Queue::Queue() CV_NOEXCEPT { p = 0; } @@ -3258,6 +3311,23 @@ Queue& Queue::operator = (const Queue& q) return *this; } +Queue::Queue(Queue&& q) CV_NOEXCEPT +{ + p = q.p; + q.p = nullptr; +} + +Queue& Queue::operator = (Queue&& q) CV_NOEXCEPT +{ + if (this != &q) { + if(p) + p->release(); + p = q.p; + q.p = nullptr; + } + return *this; +} + Queue::~Queue() { if(p) @@ -3313,7 +3383,7 @@ static cl_command_queue getQueue(const Queue& q) /////////////////////////////////////////// KernelArg ///////////////////////////////////////////// -KernelArg::KernelArg() +KernelArg::KernelArg() CV_NOEXCEPT : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1) { } @@ -3380,16 +3450,24 @@ struct Kernel::Impl haveTempSrcUMats = true; // UMat is created on RAW memory (without proper lifetime management, even from Mat) } - void addImage(const Image2D& image) + /// Preserve image lifetime (while it is specified as Kernel argument) + void registerImageArgument(int arg, const Image2D& image) { - images.push_back(image); + CV_CheckGE(arg, 0, ""); + CV_CheckLT(arg, (int)MAX_ARRS, ""); + if (arg < (int)shadow_images.size() && shadow_images[arg].ptr() != image.ptr()) // TODO future: replace ptr => impl (more strong check) + { + CV_Check(arg, !isInProgress, "ocl::Kernel: clearing of pending Image2D arguments is not allowed"); + } + shadow_images.reserve(MAX_ARRS); + shadow_images.resize(std::max(shadow_images.size(), (size_t)arg + 1)); + shadow_images[arg] = image; } void finit(cl_event e) { CV_UNUSED(e); cleanupUMats(); - images.clear(); isInProgress = false; release(); } @@ -3414,7 +3492,7 @@ struct Kernel::Impl bool isInProgress; bool isAsyncRun; // true if kernel was scheduled in async mode int nu; - std::list images; + std::vector shadow_images; bool haveTempDstUMats; bool haveTempSrcUMats; }; @@ -3447,7 +3525,7 @@ static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p) namespace cv { namespace ocl { -Kernel::Kernel() +Kernel::Kernel() CV_NOEXCEPT { p = 0; } @@ -3483,6 +3561,23 @@ Kernel& Kernel::operator = (const Kernel& k) return *this; } +Kernel::Kernel(Kernel&& k) CV_NOEXCEPT +{ + p = k.p; + k.p = nullptr; +} + +Kernel& Kernel::operator = (Kernel&& k) CV_NOEXCEPT +{ + if (this != &k) { + if(p) + p->release(); + p = k.p; + k.p = nullptr; + } + return *this; +} + Kernel::~Kernel() { if(p) @@ -3529,6 +3624,15 @@ bool Kernel::empty() const return ptr() == 0; } +static cv::String dumpValue(size_t sz, const void* p) +{ + if (sz == 4) + return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p); + if (sz == 8) + return cv::format("%lld / %lluu / 0x%16llx / %g", *(long long*)p, *(long long*)p, *(long long*)p, *(double*)p); + return cv::format("%p", p); +} + int Kernel::set(int i, const void* value, size_t sz) { if (!p || !p->handle) @@ -3539,7 +3643,7 @@ int Kernel::set(int i, const void* value, size_t sz) p->cleanupUMats(); cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value); - CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%p)", p->name.c_str(), (int)i, (int)sz, (void*)value).c_str()); + CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%s)", p->name.c_str(), (int)i, (int)sz, dumpValue(sz, value).c_str()).c_str()); if (retval != CL_SUCCESS) return -1; return i+1; @@ -3547,9 +3651,11 @@ int Kernel::set(int i, const void* value, size_t sz) int Kernel::set(int i, 
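// [editor's note] registerImageArgument() above keeps exactly one shadow Image2D per argument
// slot instead of appending to a list, so re-binding the same slot of a reused kernel cannot
// accumulate stale references. Hypothetical usage this enables (names are illustrative):
//     cv::ocl::Kernel k("my_kernel", program);
//     k.set(0, imageA);  // slot 0 holds a reference to imageA
//     k.set(0, imageB);  // re-binding slot 0 drops the imageA reference,
//                        // unless a run is still in progress (then it is an error)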
const Image2D& image2D) { - p->addImage(image2D); cl_mem h = (cl_mem)image2D.ptr(); - return set(i, &h, sizeof(h)); + int res = set(i, &h, sizeof(h)); + if (res >= 0) + p->registerImageArgument(i, image2D); + return res; } int Kernel::set(int i, const UMat& m) @@ -4026,7 +4132,7 @@ struct ProgramSource::Impl }; -ProgramSource::ProgramSource() +ProgramSource::ProgramSource() CV_NOEXCEPT { p = 0; } @@ -4070,6 +4176,23 @@ ProgramSource& ProgramSource::operator = (const ProgramSource& prog) return *this; } +ProgramSource::ProgramSource(ProgramSource&& prog) CV_NOEXCEPT +{ + p = prog.p; + prog.p = nullptr; +} + +ProgramSource& ProgramSource::operator = (ProgramSource&& prog) CV_NOEXCEPT +{ + if (this != &prog) { + if(p) + p->release(); + p = prog.p; + prog.p = nullptr; + } + return *this; +} + const String& ProgramSource::source() const { CV_Assert(p); @@ -4535,7 +4658,10 @@ struct Program::Impl }; -Program::Program() { p = 0; } +Program::Program() CV_NOEXCEPT +{ + p = 0; +} Program::Program(const ProgramSource& src, const String& buildflags, String& errmsg) @@ -4562,6 +4688,23 @@ Program& Program::operator = (const Program& prog) return *this; } +Program::Program(Program&& prog) CV_NOEXCEPT +{ + p = prog.p; + prog.p = nullptr; +} + +Program& Program::operator = (Program&& prog) CV_NOEXCEPT +{ + if (this != &prog) { + if(p) + p->release(); + p = prog.p; + prog.p = nullptr; + } + return *this; +} + Program::~Program() { if(p) @@ -6370,7 +6513,6 @@ public: static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee { static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide) - g_isOpenCVActivated = true; return g_allocator; } MatAllocator* getOpenCLAllocator() @@ -6566,6 +6708,9 @@ struct PlatformInfo::Impl refcount = 1; handle = *(cl_platform_id*)id; getDevices(devices, handle); + + version_ = getStrProp(CL_PLATFORM_VERSION); + parseOpenCLVersion(version_, versionMajor_, versionMinor_); } String getStrProp(cl_platform_info prop) const @@ -6579,9 +6724,13 @@ struct PlatformInfo::Impl IMPLEMENT_REFCOUNTABLE(); std::vector devices; cl_platform_id handle; + + String version_; + int versionMajor_; + int versionMinor_; }; -PlatformInfo::PlatformInfo() +PlatformInfo::PlatformInfo() CV_NOEXCEPT { p = 0; } @@ -6617,6 +6766,23 @@ PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i) return *this; } +PlatformInfo::PlatformInfo(PlatformInfo&& i) CV_NOEXCEPT +{ + p = i.p; + i.p = nullptr; +} + +PlatformInfo& PlatformInfo::operator = (PlatformInfo&& i) CV_NOEXCEPT +{ + if (this != &i) { + if(p) + p->release(); + p = i.p; + i.p = nullptr; + } + return *this; +} + int PlatformInfo::deviceNumber() const { return p ? (int)p->devices.size() : 0; @@ -6641,7 +6807,19 @@ String PlatformInfo::vendor() const String PlatformInfo::version() const { - return p ? p->getStrProp(CL_PLATFORM_VERSION) : String(); + return p ? 
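// [editor's note] parseOpenCLVersion(), used above for both device and platform strings,
// accepts the specification format "OpenCL <major>.<minor> <vendor specific>". A
// self-contained equivalent of the parsing rule, for illustration:
static bool parseCLVersionString(const std::string& s, int& major, int& minor)
{
    major = minor = 0;
    if (s.length() <= 10 || s.compare(0, 7, "OpenCL ") != 0)
        return false;
    size_t dot = s.find('.', 7);
    if (dot == std::string::npos)
        return false;
    major = atoi(s.substr(7, dot - 7).c_str());  // e.g. "OpenCL 1.2 CUDA" -> 1
    minor = atoi(s.substr(dot + 1).c_str());     // atoi stops at the first non-digit -> 2
    return true;
}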
p->version_ : String(); +} + +int PlatformInfo::versionMajor() const +{ + CV_Assert(p); + return p->versionMajor_; +} + +int PlatformInfo::versionMinor() const +{ + CV_Assert(p); + return p->versionMinor_; } static void getPlatforms(std::vector& platforms) @@ -7145,7 +7323,7 @@ struct Image2D::Impl cl_mem handle; }; -Image2D::Image2D() +Image2D::Image2D() CV_NOEXCEPT { p = NULL; } @@ -7203,6 +7381,23 @@ Image2D & Image2D::operator = (const Image2D & i) return *this; } +Image2D::Image2D(Image2D&& i) CV_NOEXCEPT +{ + p = i.p; + i.p = nullptr; +} + +Image2D& Image2D::operator = (Image2D&& i) CV_NOEXCEPT +{ + if (this != &i) { + if (p) + p->release(); + p = i.p; + i.p = nullptr; + } + return *this; +} + Image2D::~Image2D() { if (p) diff --git a/modules/core/src/ocl_disabled.impl.hpp b/modules/core/src/ocl_disabled.impl.hpp index 97c3856b37..b5f9c4f69b 100644 --- a/modules/core/src/ocl_disabled.impl.hpp +++ b/modules/core/src/ocl_disabled.impl.hpp @@ -34,10 +34,12 @@ CV_EXPORTS_W void finish() { /* nothing */ } CV_EXPORTS bool haveSVM() { return false; } -Device::Device() : p(NULL) { } +Device::Device() CV_NOEXCEPT : p(NULL) { } Device::Device(void* d) : p(NULL) { OCL_NOT_AVAILABLE(); } Device::Device(const Device& d) : p(NULL) { } Device& Device::operator=(const Device& d) { return *this; } +Device::Device(Device&&) CV_NOEXCEPT : p(NULL) { } +Device& Device::operator=(Device&&) CV_NOEXCEPT { return *this; } Device::~Device() { } void Device::set(void* d) { OCL_NOT_AVAILABLE(); } @@ -147,11 +149,13 @@ const Device& Device::getDefault() /* static */ Device Device::fromHandle(void* d) { OCL_NOT_AVAILABLE(); } -Context::Context() : p(NULL) { } +Context::Context() CV_NOEXCEPT : p(NULL) { } Context::Context(int dtype) : p(NULL) { } Context::~Context() { } Context::Context(const Context& c) : p(NULL) { } Context& Context::operator=(const Context& c) { return *this; } +Context::Context(Context&&) CV_NOEXCEPT : p(NULL) { } +Context& Context::operator=(Context&&) CV_NOEXCEPT { return *this; } bool Context::create() { return false; } bool Context::create(int dtype) { return false; } @@ -178,10 +182,12 @@ void Context::setUseSVM(bool enabled) { } void Context::release() { } -Platform::Platform() : p(NULL) { } +Platform::Platform() CV_NOEXCEPT : p(NULL) { } Platform::~Platform() { } Platform::Platform(const Platform&) : p(NULL) { } Platform& Platform::operator=(const Platform&) { return *this; } +Platform::Platform(Platform&&) CV_NOEXCEPT : p(NULL) { } +Platform& Platform::operator=(Platform&&) CV_NOEXCEPT { return *this; } void* Platform::ptr() const { return NULL; } @@ -198,11 +204,13 @@ void convertFromImage(void* cl_mem_image, UMat& dst) { OCL_NOT_AVAILABLE(); } void initializeContextFromHandle(Context& ctx, void* platform, void* context, void* device) { OCL_NOT_AVAILABLE(); } -Queue::Queue() : p(NULL) { } +Queue::Queue() CV_NOEXCEPT : p(NULL) { } Queue::Queue(const Context& c, const Device& d) : p(NULL) { OCL_NOT_AVAILABLE(); } Queue::~Queue() { } Queue::Queue(const Queue& q) {} Queue& Queue::operator=(const Queue& q) { return *this; } +Queue::Queue(Queue&&) CV_NOEXCEPT : p(NULL) { } +Queue& Queue::operator=(Queue&&) CV_NOEXCEPT { return *this; } bool Queue::create(const Context& c, const Device& d) { OCL_NOT_AVAILABLE(); } void Queue::finish() {} @@ -218,7 +226,7 @@ Queue& Queue::getDefault() const Queue& Queue::getProfilingQueue() const { OCL_NOT_AVAILABLE(); } -KernelArg::KernelArg() +KernelArg::KernelArg() CV_NOEXCEPT : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1) { } @@ -235,12 +243,14 @@ 
KernelArg KernelArg::Constant(const Mat& m) } -Kernel::Kernel() : p(NULL) { } +Kernel::Kernel() CV_NOEXCEPT : p(NULL) { } Kernel::Kernel(const char* kname, const Program& prog) : p(NULL) { OCL_NOT_AVAILABLE(); } Kernel::Kernel(const char* kname, const ProgramSource& prog, const String& buildopts, String* errmsg) : p(NULL) { OCL_NOT_AVAILABLE(); } Kernel::~Kernel() { } Kernel::Kernel(const Kernel& k) : p(NULL) { } Kernel& Kernel::operator=(const Kernel& k) { return *this; } +Kernel::Kernel(Kernel&&) CV_NOEXCEPT : p(NULL) { } +Kernel& Kernel::operator=(Kernel&&) CV_NOEXCEPT { return *this; } bool Kernel::empty() const { return true; } bool Kernel::create(const char* kname, const Program& prog) { OCL_NOT_AVAILABLE(); } @@ -264,10 +274,12 @@ size_t Kernel::localMemSize() const { OCL_NOT_AVAILABLE(); } void* Kernel::ptr() const { return NULL; } -Program::Program() : p(NULL) { } +Program::Program() CV_NOEXCEPT : p(NULL) { } Program::Program(const ProgramSource& src, const String& buildflags, String& errmsg) : p(NULL) { OCL_NOT_AVAILABLE(); } Program::Program(const Program& prog) : p(NULL) { } Program& Program::operator=(const Program& prog) { return *this; } +Program::Program(Program&&) CV_NOEXCEPT : p(NULL) { } +Program& Program::operator=(Program&&) CV_NOEXCEPT { return *this; } Program::~Program() { } bool Program::create(const ProgramSource& src, const String& buildflags, String& errmsg) { OCL_NOT_AVAILABLE(); } @@ -283,13 +295,15 @@ String Program::getPrefix() const { OCL_NOT_AVAILABLE(); } /* static */ String Program::getPrefix(const String& buildflags) { OCL_NOT_AVAILABLE(); } -ProgramSource::ProgramSource() : p(NULL) { } +ProgramSource::ProgramSource() CV_NOEXCEPT : p(NULL) { } ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash) : p(NULL) { } ProgramSource::ProgramSource(const String& prog) : p(NULL) { } ProgramSource::ProgramSource(const char* prog) : p(NULL) { } ProgramSource::~ProgramSource() { } ProgramSource::ProgramSource(const ProgramSource& prog) : p(NULL) { } ProgramSource& ProgramSource::operator=(const ProgramSource& prog) { return *this; } +ProgramSource::ProgramSource(ProgramSource&&) CV_NOEXCEPT : p(NULL) { } +ProgramSource& ProgramSource::operator=(ProgramSource&&) CV_NOEXCEPT { return *this; } const String& ProgramSource::source() const { OCL_NOT_AVAILABLE(); } ProgramSource::hash_t ProgramSource::hash() const { OCL_NOT_AVAILABLE(); } @@ -298,12 +312,14 @@ ProgramSource::hash_t ProgramSource::hash() const { OCL_NOT_AVAILABLE(); } /* static */ ProgramSource ProgramSource::fromSPIR(const String& module, const String& name, const unsigned char* binary, const size_t size, const cv::String& buildOptions) { OCL_NOT_AVAILABLE(); } -PlatformInfo::PlatformInfo() : p(NULL) { } +PlatformInfo::PlatformInfo() CV_NOEXCEPT : p(NULL) { } PlatformInfo::PlatformInfo(void* id) : p(NULL) { OCL_NOT_AVAILABLE(); } PlatformInfo::~PlatformInfo() { } PlatformInfo::PlatformInfo(const PlatformInfo& i) : p(NULL) { } PlatformInfo& PlatformInfo::operator=(const PlatformInfo& i) { return *this; } +PlatformInfo::PlatformInfo(PlatformInfo&&) CV_NOEXCEPT : p(NULL) { } +PlatformInfo& PlatformInfo::operator=(PlatformInfo&&) CV_NOEXCEPT { return *this; } String PlatformInfo::name() const { OCL_NOT_AVAILABLE(); } String PlatformInfo::vendor() const { OCL_NOT_AVAILABLE(); } @@ -341,11 +357,13 @@ int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray sr void buildOptionsAddMatrixDescription(String& buildOptions, const 
String& name, InputArray _m) { OCL_NOT_AVAILABLE(); } -Image2D::Image2D() : p(NULL) { } +Image2D::Image2D() CV_NOEXCEPT : p(NULL) { } Image2D::Image2D(const UMat &src, bool norm, bool alias) { OCL_NOT_AVAILABLE(); } Image2D::Image2D(const Image2D & i) : p(NULL) { OCL_NOT_AVAILABLE(); } Image2D::~Image2D() { } Image2D& Image2D::operator=(const Image2D & i) { return *this; } +Image2D::Image2D(Image2D&&) CV_NOEXCEPT : p(NULL) { } +Image2D& Image2D::operator=(Image2D&&) CV_NOEXCEPT { return *this; } /* static */ bool Image2D::canCreateAlias(const UMat &u) { OCL_NOT_AVAILABLE(); } /* static */ bool Image2D::isFormatSupported(int depth, int cn, bool norm) { OCL_NOT_AVAILABLE(); } diff --git a/modules/core/src/opencl/cvtclr_dx.cl b/modules/core/src/opencl/cvtclr_dx.cl index 0ca2118c77..5c51077814 100644 --- a/modules/core/src/opencl/cvtclr_dx.cl +++ b/modules/core/src/opencl/cvtclr_dx.cl @@ -91,63 +91,50 @@ void YUV2BGR_NV12_8u( { int x = get_global_id(0); int y = get_global_id(1); + // each iteration computes 2*2=4 pixels + int x2 = x*2; + int y2 = y*2; - if (x + 1 < cols) - { - if (y + 1 < rows) - { - __global uchar* pDstRow1 = pBGR + mad24(y, bgrStep, mad24(x, NCHANNELS, 0)); - __global uchar* pDstRow2 = pDstRow1 + bgrStep; + if (x2 + 1 < cols) { + if (y2 + 1 < rows) { + __global uchar *pDstRow1 = pBGR + mad24(y2, bgrStep, mad24(x2, NCHANNELS, 0)); + __global uchar *pDstRow2 = pDstRow1 + bgrStep; - float4 Y1 = read_imagef(imgY, (int2)(x+0, y+0)); - float4 Y2 = read_imagef(imgY, (int2)(x+1, y+0)); - float4 Y3 = read_imagef(imgY, (int2)(x+0, y+1)); - float4 Y4 = read_imagef(imgY, (int2)(x+1, y+1)); + float4 Y1 = read_imagef(imgY, (int2)(x2 + 0, y2 + 0)); + float4 Y2 = read_imagef(imgY, (int2)(x2 + 1, y2 + 0)); + float4 Y3 = read_imagef(imgY, (int2)(x2 + 0, y2 + 1)); + float4 Y4 = read_imagef(imgY, (int2)(x2 + 1, y2 + 1)); + float4 Y = (float4)(Y1.x, Y2.x, Y3.x, Y4.x); - float4 UV = read_imagef(imgUV, (int2)(x/2, y/2)) - d2; + float4 UV = read_imagef(imgUV, (int2)(x, y)) - d2; - __constant float* coeffs = c_YUV2RGBCoeffs_420; + __constant float *coeffs = c_YUV2RGBCoeffs_420; - Y1 = max(0.f, Y1 - d1) * coeffs[0]; - Y2 = max(0.f, Y2 - d1) * coeffs[0]; - Y3 = max(0.f, Y3 - d1) * coeffs[0]; - Y4 = max(0.f, Y4 - d1) * coeffs[0]; + Y = max(0.f, Y - d1) * coeffs[0]; float ruv = fma(coeffs[4], UV.y, 0.0f); float guv = fma(coeffs[3], UV.y, fma(coeffs[2], UV.x, 0.0f)); float buv = fma(coeffs[1], UV.x, 0.0f); - float R1 = (Y1.x + ruv) * CV_8U_MAX; - float G1 = (Y1.x + guv) * CV_8U_MAX; - float B1 = (Y1.x + buv) * CV_8U_MAX; + float4 R = (Y + ruv) * CV_8U_MAX; + float4 G = (Y + guv) * CV_8U_MAX; + float4 B = (Y + buv) * CV_8U_MAX; - float R2 = (Y2.x + ruv) * CV_8U_MAX; - float G2 = (Y2.x + guv) * CV_8U_MAX; - float B2 = (Y2.x + buv) * CV_8U_MAX; + pDstRow1[0*NCHANNELS + 0] = convert_uchar_sat(B.x); + pDstRow1[0*NCHANNELS + 1] = convert_uchar_sat(G.x); + pDstRow1[0*NCHANNELS + 2] = convert_uchar_sat(R.x); - float R3 = (Y3.x + ruv) * CV_8U_MAX; - float G3 = (Y3.x + guv) * CV_8U_MAX; - float B3 = (Y3.x + buv) * CV_8U_MAX; + pDstRow1[1*NCHANNELS + 0] = convert_uchar_sat(B.y); + pDstRow1[1*NCHANNELS + 1] = convert_uchar_sat(G.y); + pDstRow1[1*NCHANNELS + 2] = convert_uchar_sat(R.y); - float R4 = (Y4.x + ruv) * CV_8U_MAX; - float G4 = (Y4.x + guv) * CV_8U_MAX; - float B4 = (Y4.x + buv) * CV_8U_MAX; + pDstRow2[0*NCHANNELS + 0] = convert_uchar_sat(B.z); + pDstRow2[0*NCHANNELS + 1] = convert_uchar_sat(G.z); + pDstRow2[0*NCHANNELS + 2] = convert_uchar_sat(R.z); - pDstRow1[0*NCHANNELS + 0] = convert_uchar_sat(B1); - 
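// [editor's note] The rewritten NV12 kernels in cvtclr_dx.cl map one work-item to a 2x2 pixel
// quad: x2 = 2*get_global_id(0), y2 = 2*get_global_id(1) address the four luma samples, and the
// shared chroma pair is read once at (x, y) instead of (x/2, y/2). The host must therefore
// launch a quarter-size NDRange; a hypothetical launch computation:
//     size_t globalsize[2] = { ((size_t)cols + 1) / 2, ((size_t)rows + 1) / 2 }; // one item per quad
//     k.run(2, globalsize, NULL, false);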
pDstRow1[0*NCHANNELS + 1] = convert_uchar_sat(G1); - pDstRow1[0*NCHANNELS + 2] = convert_uchar_sat(R1); - - pDstRow1[1*NCHANNELS + 0] = convert_uchar_sat(B2); - pDstRow1[1*NCHANNELS + 1] = convert_uchar_sat(G2); - pDstRow1[1*NCHANNELS + 2] = convert_uchar_sat(R2); - - pDstRow2[0*NCHANNELS + 0] = convert_uchar_sat(B3); - pDstRow2[0*NCHANNELS + 1] = convert_uchar_sat(G3); - pDstRow2[0*NCHANNELS + 2] = convert_uchar_sat(R3); - - pDstRow2[1*NCHANNELS + 0] = convert_uchar_sat(B4); - pDstRow2[1*NCHANNELS + 1] = convert_uchar_sat(G4); - pDstRow2[1*NCHANNELS + 2] = convert_uchar_sat(R4); + pDstRow2[1*NCHANNELS + 0] = convert_uchar_sat(B.w); + pDstRow2[1*NCHANNELS + 1] = convert_uchar_sat(G.w); + pDstRow2[1*NCHANNELS + 2] = convert_uchar_sat(R.w); } } } @@ -172,12 +159,15 @@ void BGR2YUV_NV12_8u( { int x = get_global_id(0); int y = get_global_id(1); + // each iteration computes 2*2=4 pixels + int x2 = x*2; + int y2 = y*2; - if (x < cols) + if (x2 + 1 < cols) { - if (y < rows) + if (y2 + 1 < rows) { - __global const uchar* pSrcRow1 = pBGR + mad24(y, bgrStep, mad24(x, NCHANNELS, 0)); + __global const uchar* pSrcRow1 = pBGR + mad24(y2, bgrStep, mad24(x2, NCHANNELS, 0)); __global const uchar* pSrcRow2 = pSrcRow1 + bgrStep; float4 src_pix1 = convert_float4(vload4(0, pSrcRow1 + 0*NCHANNELS)) * CV_8U_SCALE; @@ -196,12 +186,12 @@ void BGR2YUV_NV12_8u( UV.x = fma(coeffs[3], src_pix1.z, fma(coeffs[4], src_pix1.y, fma(coeffs[5], src_pix1.x, d2))); UV.y = fma(coeffs[5], src_pix1.z, fma(coeffs[6], src_pix1.y, fma(coeffs[7], src_pix1.x, d2))); - write_imagef(imgY, (int2)(x+0, y+0), Y1); - write_imagef(imgY, (int2)(x+1, y+0), Y2); - write_imagef(imgY, (int2)(x+0, y+1), Y3); - write_imagef(imgY, (int2)(x+1, y+1), Y4); + write_imagef(imgY, (int2)(x2+0, y2+0), Y1); + write_imagef(imgY, (int2)(x2+1, y2+0), Y2); + write_imagef(imgY, (int2)(x2+0, y2+1), Y3); + write_imagef(imgY, (int2)(x2+1, y2+1), Y4); - write_imagef(imgUV, (int2)((x/2), (y/2)), UV); + write_imagef(imgUV, (int2)(x, y), UV); } } } diff --git a/modules/core/src/opencl/halfconvert.cl b/modules/core/src/opencl/halfconvert.cl index 506df69faf..9df602f406 100644 --- a/modules/core/src/opencl/halfconvert.cl +++ b/modules/core/src/opencl/halfconvert.cl @@ -47,8 +47,17 @@ #endif #endif -__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset, - __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols) +__kernel void +#ifdef FLOAT_TO_HALF + convertFp16_FP32_to_FP16 +#else + convertFp16_FP16_to_FP32 +#endif +( + __global const uchar * srcptr, int src_step, int src_offset, + __global uchar * dstptr, int dst_step, int dst_offset, + int dst_rows, int dst_cols +) { int x = get_global_id(0); int y0 = get_global_id(1) * rowsPerWI; diff --git a/modules/core/src/opengl.cpp b/modules/core/src/opengl.cpp index 5ff3c717b6..ab39b1b8ac 100644 --- a/modules/core/src/opengl.cpp +++ b/modules/core/src/opengl.cpp @@ -1575,6 +1575,7 @@ void cv::ogl::render(const ogl::Arrays& arr, InputArray indices, int mode, Scala // CL-GL Interoperability #ifdef HAVE_OPENCL +# include "opencv2/core/opencl/runtime/opencl_core.hpp" # include "opencv2/core/opencl/runtime/opencl_gl.hpp" # ifdef cl_khr_gl_sharing # define HAVE_OPENCL_OPENGL_SHARING @@ -1595,6 +1596,34 @@ void cv::ogl::render(const ogl::Arrays& arr, InputArray indices, int mode, Scala namespace cv { namespace ogl { +#if defined(HAVE_OPENCL) && defined(HAVE_OPENGL) && defined(HAVE_OPENCL_OPENGL_SHARING) +// Check to avoid crash in OpenCL runtime: 
https://github.com/opencv/opencv/issues/5209 +static void checkOpenCLVersion() +{ + using namespace cv::ocl; + const Device& device = Device::getDefault(); + //CV_Assert(!device.empty()); + cl_device_id dev = (cl_device_id)device.ptr(); + CV_Assert(dev); + + cl_platform_id platform_id = 0; + size_t sz = 0; + + cl_int status = clGetDeviceInfo(dev, CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, &sz); + CV_Assert(status == CL_SUCCESS && sz == sizeof(cl_platform_id)); + CV_Assert(platform_id); + + PlatformInfo pi(&platform_id); + int versionMajor = pi.versionMajor(); + int versionMinor = pi.versionMinor(); + if (versionMajor < 1 || (versionMajor == 1 && versionMinor <= 1)) + CV_Error_(cv::Error::OpenCLApiCallError, + ("OpenCL: clCreateFromGLTexture requires OpenCL 1.2+ version: %d.%d - %s (%s)", + versionMajor, versionMinor, pi.name().c_str(), pi.version().c_str()) + ); +} +#endif + namespace ocl { Context& initializeContextFromGL() @@ -1719,6 +1748,8 @@ void convertToGLTexture2D(InputArray src, Texture2D& texture) Context& ctx = Context::getDefault(); cl_context context = (cl_context)ctx.ptr(); + checkOpenCLVersion(); // clCreateFromGLTexture requires OpenCL 1.2 + UMat u = src.getUMat(); // TODO Add support for roi @@ -1777,6 +1808,8 @@ void convertFromGLTexture2D(const Texture2D& texture, OutputArray dst) Context& ctx = Context::getDefault(); cl_context context = (cl_context)ctx.ptr(); + checkOpenCLVersion(); // clCreateFromGLTexture requires OpenCL 1.2 + // TODO Need to specify ACCESS_WRITE here somehow to prevent useless data copying! dst.create(texture.size(), textureType); UMat u = dst.getUMat(); diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp index 9dc0fd00f0..879d80cdb1 100644 --- a/modules/core/src/parallel.cpp +++ b/modules/core/src/parallel.cpp @@ -45,6 +45,9 @@ #include #include +#include "opencv2/core/parallel/parallel_backend.hpp" +#include "parallel/parallel.hpp" + #if defined _WIN32 || defined WINCE #include #undef small @@ -101,7 +104,6 @@ #endif #include "tbb/tbb.h" #include "tbb/task.h" - #include "tbb/tbb_stddef.h" #if TBB_INTERFACE_VERSION >= 8000 #include "tbb/task_arena.h" #endif @@ -145,9 +147,7 @@ # define CV_PARALLEL_FRAMEWORK "pthreads" #endif -#ifdef CV_PARALLEL_FRAMEWORK #include -#endif #include "parallel_impl.hpp" @@ -159,9 +159,10 @@ namespace cv { ParallelLoopBody::~ParallelLoopBody() {} +using namespace cv::parallel; + namespace { -#ifdef CV_PARALLEL_FRAMEWORK #ifdef ENABLE_INSTRUMENTATION static void SyncNodes(cv::instr::InstrNode *pNode) { @@ -430,8 +431,6 @@ namespace { typedef ParallelLoopBodyWrapper ProxyLoopBody; #endif -static int numThreads = -1; - #if defined HAVE_TBB #if TBB_INTERFACE_VERSION >= 8000 static tbb::task_arena tbbArena(tbb::task_arena::automatic); @@ -446,7 +445,7 @@ static inline int _initMaxThreads() int maxThreads = omp_get_max_threads(); if (!utils::getConfigurationParameterBool("OPENCV_FOR_OPENMP_DYNAMIC_DISABLE", false)) { - omp_set_dynamic(maxThreads); + omp_set_dynamic(1); } return maxThreads; } @@ -477,15 +476,11 @@ static SchedPtr pplScheduler; #endif -#endif // CV_PARALLEL_FRAMEWORK - } // namespace anon /* ================================ parallel_for_ ================================ */ -#ifdef CV_PARALLEL_FRAMEWORK static void parallel_for_impl(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes); // forward declaration -#endif void parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes) { @@ -500,7 +495,6 @@ void parallel_for_(const 
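// [editor's note] checkOpenCLVersion() above turns a hard crash (issue #5209) into a clear
// error: clCreateFromGLTexture only exists since OpenCL 1.2, so both GL interop directions now
// validate the platform version first. With the new PlatformInfo accessors the gate is a
// simple comparison:
//     if (pi.versionMajor() < 1 || (pi.versionMajor() == 1 && pi.versionMinor() <= 1))
//         CV_Error(cv::Error::OpenCLApiCallError, "clCreateFromGLTexture requires OpenCL 1.2+");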
cv::Range& range, const cv::ParallelLoopBody& body, dou if (range.empty()) return; -#ifdef CV_PARALLEL_FRAMEWORK static std::atomic flagNestedParallelFor(false); bool isNotNestedRegion = !flagNestedParallelFor.load(); if (isNotNestedRegion) @@ -519,16 +513,23 @@ void parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body, dou } } else // nested parallel_for_() calls are not parallelized -#endif // CV_PARALLEL_FRAMEWORK { CV_UNUSED(nstripes); body(range); } } -#ifdef CV_PARALLEL_FRAMEWORK +static +void parallel_for_cb(int start, int end, void* data) +{ + CV_DbgAssert(data); + const cv::ParallelLoopBody& body = *(const cv::ParallelLoopBody*)data; + body(Range(start, end)); +} + static void parallel_for_impl(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes) { + using namespace cv::parallel; if ((numThreads < 0 || numThreads > 1) && range.end - range.start > 1) { ParallelLoopBodyWrapperContext ctx(body, range, nstripes); @@ -540,6 +541,16 @@ static void parallel_for_impl(const cv::Range& range, const cv::ParallelLoopBody return; } + std::shared_ptr& api = getCurrentParallelForAPI(); + if (api) + { + CV_CheckEQ(stripeRange.start, 0, ""); + api->parallel_for(stripeRange.end, parallel_for_cb, (void*)&pbody); + ctx.finalize(); // propagate exceptions if exists + return; + } + +#ifdef CV_PARALLEL_FRAMEWORK #if defined HAVE_TBB #if TBB_INTERFACE_VERSION >= 8000 @@ -590,24 +601,25 @@ static void parallel_for_impl(const cv::Range& range, const cv::ParallelLoopBody #endif ctx.finalize(); // propagate exceptions if exists - } - else - { - body(range); - } -} + return; #endif // CV_PARALLEL_FRAMEWORK + } + + body(range); +} int getNumThreads(void) { -#ifdef CV_PARALLEL_FRAMEWORK + std::shared_ptr& api = getCurrentParallelForAPI(); + if (api) + { + return api->getNumThreads(); + } - if(numThreads == 0) + if (numThreads == 0) return 1; -#endif - #if defined HAVE_TBB #if TBB_INTERFACE_VERSION >= 9100 @@ -682,10 +694,15 @@ unsigned defaultNumberOfThreads() void setNumThreads( int threads_ ) { CV_UNUSED(threads_); -#ifdef CV_PARALLEL_FRAMEWORK + int threads = (threads_ < 0) ? defaultNumberOfThreads() : (unsigned)threads_; numThreads = threads; -#endif + + std::shared_ptr& api = getCurrentParallelForAPI(); + if (api) + { + api->setNumThreads(numThreads); + } #ifdef HAVE_TBB @@ -741,6 +758,12 @@ void setNumThreads( int threads_ ) int getThreadNum() { + std::shared_ptr& api = getCurrentParallelForAPI(); + if (api) + { + return api->getThreadNum(); + } + #if defined HAVE_TBB #if TBB_INTERFACE_VERSION >= 9100 return tbb::this_task_arena::current_thread_index(); @@ -963,7 +986,13 @@ int getNumberOfCPUs() return nCPUs; // cached value } -const char* currentParallelFramework() { +const char* currentParallelFramework() +{ + std::shared_ptr& api = getCurrentParallelForAPI(); + if (api) + { + return api->getName(); + } #ifdef CV_PARALLEL_FRAMEWORK return CV_PARALLEL_FRAMEWORK; #else diff --git a/modules/core/src/parallel/factory_parallel.hpp b/modules/core/src/parallel/factory_parallel.hpp new file mode 100644 index 0000000000..693fe30ecf --- /dev/null +++ b/modules/core/src/parallel/factory_parallel.hpp @@ -0,0 +1,48 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
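// [editor's note] The dispatch above forwards cv::parallel_for_ into a C-style backend
// callback: the ParallelLoopBody is passed as an opaque pointer and re-invoked per stripe.
// Minimal sketch of that trampoline pattern (mirrors parallel_for_cb above):
static void forEachStripe(int start, int end, void* data)
{
    const cv::ParallelLoopBody& body = *(const cv::ParallelLoopBody*)data; // opaque -> typed
    body(cv::Range(start, end)); // execute one stripe of the original range
}
// a backend then only needs: api->parallel_for(totalStripes, forEachStripe, &body);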
+ +#ifndef OPENCV_CORE_PARALLEL_FACTORY_HPP +#define OPENCV_CORE_PARALLEL_FACTORY_HPP + +#include "opencv2/core/parallel/parallel_backend.hpp" + +namespace cv { namespace parallel { + +class IParallelBackendFactory +{ +public: + virtual ~IParallelBackendFactory() {} + virtual std::shared_ptr create() const = 0; +}; + + +class StaticBackendFactory CV_FINAL: public IParallelBackendFactory +{ +protected: + std::function(void)> create_fn_; + +public: + StaticBackendFactory(std::function(void)>&& create_fn) + : create_fn_(create_fn) + { + // nothing + } + + ~StaticBackendFactory() CV_OVERRIDE {} + + std::shared_ptr create() const CV_OVERRIDE + { + return create_fn_(); + } +}; + +// +// PluginBackendFactory is implemented in plugin_wrapper.cpp +// + +std::shared_ptr createPluginParallelBackendFactory(const std::string& baseName); + +}} // namespace + +#endif // OPENCV_CORE_PARALLEL_FACTORY_HPP diff --git a/modules/core/src/parallel/parallel.cpp b/modules/core/src/parallel/parallel.cpp new file mode 100644 index 0000000000..29b482f5f3 --- /dev/null +++ b/modules/core/src/parallel/parallel.cpp @@ -0,0 +1,177 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#include "../precomp.hpp" +#include "parallel.hpp" + +#include +#include +#ifdef NDEBUG +#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1 +#else +#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_VERBOSE + 1 +#endif +#include + + +#include "registry_parallel.hpp" +#include "registry_parallel.impl.hpp" + +#include "plugin_parallel_api.hpp" +#include "plugin_parallel_wrapper.impl.hpp" + + +namespace cv { namespace parallel { + +int numThreads = -1; + +ParallelForAPI::~ParallelForAPI() +{ + // nothing +} + +static +std::string& getParallelBackendName() +{ + static std::string g_backendName = toUpperCase(cv::utils::getConfigurationParameterString("OPENCV_PARALLEL_BACKEND", "")); + return g_backendName; +} + +static bool g_initializedParallelForAPI = false; + +static +std::shared_ptr createParallelForAPI() +{ + const std::string& name = getParallelBackendName(); + bool isKnown = false; + const auto& backends = getParallelBackendsInfo(); + if (!name.empty()) + { + CV_LOG_INFO(NULL, "core(parallel): requested backend name: " << name); + } + for (size_t i = 0; i < backends.size(); i++) + { + const auto& info = backends[i]; + if (!name.empty()) + { + if (name != info.name) + { + continue; + } + isKnown = true; + } + try + { + CV_LOG_DEBUG(NULL, "core(parallel): trying backend: " << info.name << " (priority=" << info.priority << ")"); + if (!info.backendFactory) + { + CV_LOG_DEBUG(NULL, "core(parallel): factory is not available (plugins require filesystem support): " << info.name); + continue; + } + std::shared_ptr backend = info.backendFactory->create(); + if (!backend) + { + CV_LOG_VERBOSE(NULL, 0, "core(parallel): not available: " << info.name); + continue; + } + CV_LOG_INFO(NULL, "core(parallel): using backend: " << info.name << " (priority=" << info.priority << ")"); + g_initializedParallelForAPI = true; + getParallelBackendName() = info.name; + return backend; + } + catch (const std::exception& e) + { + CV_LOG_WARNING(NULL, "core(parallel): can't initialize " << info.name << " backend: " << e.what()); + } + catch (...) 
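// [editor's note] Backend selection above is driven by the OPENCV_PARALLEL_BACKEND
// environment variable and a priority-sorted registry; the same switch is exposed
// programmatically via setParallelForBackend() below. A hypothetical caller (backend names
// follow the registry, e.g. "TBB"):
//     cv::parallel::setParallelForBackend("TBB"); // returns false if the backend is unavailable
//     cv::parallel_for_(cv::Range(0, 100), body); // now routed through the selected backend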
+ { + CV_LOG_WARNING(NULL, "core(parallel): can't initialize " << info.name << " backend: Unknown C++ exception"); + } + } + if (name.empty()) + { + CV_LOG_DEBUG(NULL, "core(parallel): fallback on builtin code"); + } + else + { + if (!isKnown) + CV_LOG_INFO(NULL, "core(parallel): unknown backend: " << name); + } + g_initializedParallelForAPI = true; + return std::shared_ptr(); +} + +static inline +std::shared_ptr createDefaultParallelForAPI() +{ + CV_LOG_DEBUG(NULL, "core(parallel): Initializing parallel backend..."); + return createParallelForAPI(); +} + +std::shared_ptr& getCurrentParallelForAPI() +{ + static std::shared_ptr g_currentParallelForAPI = createDefaultParallelForAPI(); + return g_currentParallelForAPI; +} + +void setParallelForBackend(const std::shared_ptr& api, bool propagateNumThreads) +{ + getCurrentParallelForAPI() = api; + if (propagateNumThreads && api) + { + setNumThreads(numThreads); + } +} + +bool setParallelForBackend(const std::string& backendName, bool propagateNumThreads) +{ + CV_TRACE_FUNCTION(); + + std::string backendName_u = toUpperCase(backendName); + if (g_initializedParallelForAPI) + { + // ... already initialized + if (getParallelBackendName() == backendName_u) + { + CV_LOG_INFO(NULL, "core(parallel): backend is already activated: " << (backendName.empty() ? "builtin(legacy)" : backendName)); + return true; + } + else + { + // ... re-create new + CV_LOG_DEBUG(NULL, "core(parallel): replacing parallel backend..."); + getParallelBackendName() = backendName_u; + getCurrentParallelForAPI() = createParallelForAPI(); + } + } + else + { + // ... no backend exists, just specify the name (initialization is triggered by getCurrentParallelForAPI() call) + getParallelBackendName() = backendName_u; + } + std::shared_ptr api = getCurrentParallelForAPI(); + if (!api) + { + if (!backendName.empty()) + { + CV_LOG_WARNING(NULL, "core(parallel): backend is not available: " << backendName << " (using builtin legacy code)"); + return false; + } + else + { + CV_LOG_WARNING(NULL, "core(parallel): switched to builtin code (legacy)"); + } + } + if (!backendName_u.empty()) + { + CV_Assert(backendName_u == getParallelBackendName()); // data race? + } + + if (propagateNumThreads) + { + setNumThreads(numThreads); + } + return true; +} + +}} // namespace diff --git a/modules/core/src/parallel/parallel.hpp b/modules/core/src/parallel/parallel.hpp new file mode 100644 index 0000000000..b6a54b14e7 --- /dev/null +++ b/modules/core/src/parallel/parallel.hpp @@ -0,0 +1,29 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#ifndef OPENCV_CORE_SRC_PARALLEL_PARALLEL_HPP +#define OPENCV_CORE_SRC_PARALLEL_PARALLEL_HPP + +#include "opencv2/core/parallel/parallel_backend.hpp" + +namespace cv { namespace parallel { + +extern int numThreads; + +std::shared_ptr& getCurrentParallelForAPI(); + +#ifndef BUILD_PLUGIN + +#ifdef HAVE_TBB +std::shared_ptr createParallelBackendTBB(); +#endif + +#ifdef HAVE_OPENMP +std::shared_ptr createParallelBackendOpenMP(); +#endif + +#endif // BUILD_PLUGIN + +}} // namespace + +#endif // OPENCV_CORE_SRC_PARALLEL_PARALLEL_HPP diff --git a/modules/core/src/parallel/parallel_openmp.cpp b/modules/core/src/parallel/parallel_openmp.cpp new file mode 100644 index 0000000000..c0010dd845 --- /dev/null +++ b/modules/core/src/parallel/parallel_openmp.cpp @@ -0,0 +1,72 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#include "../precomp.hpp" + +#ifdef HAVE_OPENMP + +#include "parallel.hpp" +#include "opencv2/core/parallel/backend/parallel_for.openmp.hpp" + +namespace cv { namespace parallel { + +static +std::shared_ptr<cv::parallel::ParallelForAPI>& getInstance() +{ + static std::shared_ptr<cv::parallel::ParallelForAPI> g_instance = std::make_shared<openmp::ParallelForBackend>(); + return g_instance; +} + +#ifndef BUILD_PLUGIN +std::shared_ptr<cv::parallel::ParallelForAPI> createParallelBackendOpenMP() +{ + return getInstance(); +} +#endif + +}} // namespace + +#ifdef BUILD_PLUGIN + +#define ABI_VERSION 0 +#define API_VERSION 0 +#include "plugin_parallel_api.hpp" + +static +CvResult cv_getInstance(CV_OUT CvPluginParallelBackendAPI* handle) CV_NOEXCEPT +{ + try + { + if (!handle) + return CV_ERROR_FAIL; + *handle = cv::parallel::getInstance().get(); + return CV_ERROR_OK; + } + catch (...) + { + return CV_ERROR_FAIL; + } +} + +static const OpenCV_Core_Parallel_Plugin_API plugin_api = +{ + { + sizeof(OpenCV_Core_Parallel_Plugin_API), ABI_VERSION, API_VERSION, + CV_VERSION_MAJOR, CV_VERSION_MINOR, CV_VERSION_REVISION, CV_VERSION_STATUS, + "OpenMP (" CVAUX_STR(_OPENMP) ") OpenCV parallel plugin" + }, + { + /* 1*/cv_getInstance + } +}; + +const OpenCV_Core_Parallel_Plugin_API* CV_API_CALL opencv_core_parallel_plugin_init_v0(int requested_abi_version, int requested_api_version, void* /*reserved=NULL*/) CV_NOEXCEPT +{ + if (requested_abi_version == ABI_VERSION && requested_api_version <= API_VERSION) + return &plugin_api; + return NULL; +} + +#endif // BUILD_PLUGIN + +#endif // HAVE_OPENMP diff --git a/modules/core/src/parallel/parallel_tbb.cpp b/modules/core/src/parallel/parallel_tbb.cpp new file mode 100644 index 0000000000..d430e858e6 --- /dev/null +++ b/modules/core/src/parallel/parallel_tbb.cpp @@ -0,0 +1,74 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#include "../precomp.hpp" + +#include "factory_parallel.hpp" + +#ifdef HAVE_TBB + +#include "parallel.hpp" +#include "opencv2/core/parallel/backend/parallel_for.tbb.hpp" + +namespace cv { namespace parallel { + +static +std::shared_ptr<cv::parallel::ParallelForAPI>& getInstance() +{ + static std::shared_ptr<cv::parallel::ParallelForAPI> g_instance = std::make_shared<tbb::ParallelForBackend>(); + return g_instance; +} + +#ifndef BUILD_PLUGIN +std::shared_ptr<cv::parallel::ParallelForAPI> createParallelBackendTBB() +{ + return getInstance(); +} +#endif + +}} // namespace + +#ifdef BUILD_PLUGIN + +#define ABI_VERSION 0 +#define API_VERSION 0 +#include "plugin_parallel_api.hpp" + +static +CvResult cv_getInstance(CV_OUT CvPluginParallelBackendAPI* handle) CV_NOEXCEPT +{ + try + { + if (!handle) + return CV_ERROR_FAIL; + *handle = cv::parallel::getInstance().get(); + return CV_ERROR_OK; + } + catch (...)
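// [editor's note] Both static backends above compile to the same plugin shape when
// BUILD_PLUGIN is set: a single exported entry point that negotiates versions. The loader
// (see plugin_parallel_wrapper.impl.hpp below) probes API levels downward until the plugin
// accepts one:
//     for (int api = API_VERSION; api >= 0; api--)
//         if ((plugin_api = fn_init(ABI_VERSION, api, NULL)) != NULL)
//             break; // accepted: ABI matches exactly, API may be lower than requested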
+ { + return CV_ERROR_FAIL; + } +} + +static const OpenCV_Core_Parallel_Plugin_API plugin_api = +{ + { + sizeof(OpenCV_Core_Parallel_Plugin_API), ABI_VERSION, API_VERSION, + CV_VERSION_MAJOR, CV_VERSION_MINOR, CV_VERSION_REVISION, CV_VERSION_STATUS, + "TBB (interface " CVAUX_STR(TBB_INTERFACE_VERSION) ") OpenCV parallel plugin" + }, + { + /* 1*/cv_getInstance + } +}; + +const OpenCV_Core_Parallel_Plugin_API* CV_API_CALL opencv_core_parallel_plugin_init_v0(int requested_abi_version, int requested_api_version, void* /*reserved=NULL*/) CV_NOEXCEPT +{ + if (requested_abi_version == ABI_VERSION && requested_api_version <= API_VERSION) + return &plugin_api; + return NULL; +} + +#endif // BUILD_PLUGIN + +#endif // HAVE_TBB diff --git a/modules/core/src/parallel/plugin_parallel_api.hpp b/modules/core/src/parallel/plugin_parallel_api.hpp new file mode 100644 index 0000000000..bdc28d6de0 --- /dev/null +++ b/modules/core/src/parallel/plugin_parallel_api.hpp @@ -0,0 +1,72 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef PARALLEL_PLUGIN_API_HPP +#define PARALLEL_PLUGIN_API_HPP + +#include +#include + +#include "opencv2/core/parallel/parallel_backend.hpp" + +#if !defined(BUILD_PLUGIN) + +/// increased for backward-compatible changes, e.g. add new function +/// Caller API <= Plugin API -> plugin is fully compatible +/// Caller API > Plugin API -> plugin is not fully compatible, caller should use extra checks to use plugins with older API +#define API_VERSION 0 // preview + +/// increased for incompatible changes, e.g. remove function argument +/// Caller ABI == Plugin ABI -> plugin is compatible +/// Caller ABI > Plugin ABI -> plugin is not compatible, caller should use shim code to use old ABI plugins (caller may know how lower ABI works, so it is possible) +/// Caller ABI < Plugin ABI -> plugin can't be used (plugin should provide interface with lower ABI to handle that) +#define ABI_VERSION 0 // preview + +#else // !defined(BUILD_PLUGIN) + +#if !defined(ABI_VERSION) || !defined(API_VERSION) +#error "Plugin must define ABI_VERSION and API_VERSION before including parallel_plugin_api.hpp" +#endif + +#endif // !defined(BUILD_PLUGIN) + +typedef cv::parallel::ParallelForAPI* CvPluginParallelBackendAPI; + +struct OpenCV_Core_Parallel_Plugin_API_v0_0_api_entries +{ + /** @brief Get parallel backend API instance + + @param[out] handle pointer on backend API handle + + @note API-CALL 1, API-Version == 0 + */ + CvResult (CV_API_CALL *getInstance)(CV_OUT CvPluginParallelBackendAPI* handle) CV_NOEXCEPT; +}; // OpenCV_Core_Parallel_Plugin_API_v0_0_api_entries + +typedef struct OpenCV_Core_Parallel_Plugin_API_v0 +{ + OpenCV_API_Header api_header; + struct OpenCV_Core_Parallel_Plugin_API_v0_0_api_entries v0; +} OpenCV_Core_Parallel_Plugin_API_v0; + +#if ABI_VERSION == 0 && API_VERSION == 0 +typedef OpenCV_Core_Parallel_Plugin_API_v0 OpenCV_Core_Parallel_Plugin_API; +#else +#error "Not supported configuration: check ABI_VERSION/API_VERSION" +#endif + +#ifdef BUILD_PLUGIN +extern "C" { + +CV_PLUGIN_EXPORTS +const OpenCV_Core_Parallel_Plugin_API* CV_API_CALL opencv_core_parallel_plugin_init_v0 + (int requested_abi_version, int requested_api_version, void* reserved /*NULL*/) CV_NOEXCEPT; + +} // extern "C" +#else // BUILD_PLUGIN +typedef const OpenCV_Core_Parallel_Plugin_API* (CV_API_CALL *FN_opencv_core_parallel_plugin_init_t) + (int requested_abi_version, int 
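// [editor's note] In short, the versioning rules documented above: ABI must match exactly,
// while a plugin with an older API can still be loaded if the caller degrades gracefully.
// E.g. a caller built with (ABI 0, API 1) may use an (ABI 0, API 0) plugin, but must check the
// reported level before touching any hypothetical v1 entry:
//     if (plugin_api->api_header.api_version >= 1)
//         /* safe to use v1 entries */;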
requested_api_version, void* reserved /*NULL*/); +#endif // BUILD_PLUGIN + +#endif // PARALLEL_PLUGIN_API_HPP diff --git a/modules/core/src/parallel/plugin_parallel_wrapper.impl.hpp b/modules/core/src/parallel/plugin_parallel_wrapper.impl.hpp new file mode 100644 index 0000000000..a5649b60c0 --- /dev/null +++ b/modules/core/src/parallel/plugin_parallel_wrapper.impl.hpp @@ -0,0 +1,287 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// +// Not a standalone header, part of parallel.cpp +// + +//================================================================================================== +// Dynamic backend implementation + +#include "opencv2/core/utils/plugin_loader.private.hpp" + +namespace cv { namespace impl { + +using namespace cv::parallel; + +#if OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(PARALLEL_ENABLE_PLUGINS) + +using namespace cv::plugin::impl; // plugin_loader.hpp + +class PluginParallelBackend CV_FINAL: public std::enable_shared_from_this +{ +protected: + void initPluginAPI() + { + const char* init_name = "opencv_core_parallel_plugin_init_v0"; + FN_opencv_core_parallel_plugin_init_t fn_init = reinterpret_cast(lib_->getSymbol(init_name)); + if (fn_init) + { + CV_LOG_DEBUG(NULL, "Found entry: '" << init_name << "'"); + for (int supported_api_version = API_VERSION; supported_api_version >= 0; supported_api_version--) + { + plugin_api_ = fn_init(ABI_VERSION, supported_api_version, NULL); + if (plugin_api_) + break; + } + if (!plugin_api_) + { + CV_LOG_INFO(NULL, "core(parallel): plugin is incompatible (can't be initialized): " << lib_->getName()); + return; + } + if (!checkCompatibility(plugin_api_->api_header, ABI_VERSION, API_VERSION, false)) + { + plugin_api_ = NULL; + return; + } + CV_LOG_INFO(NULL, "core(parallel): plugin is ready to use '" << plugin_api_->api_header.api_description << "'"); + } + else + { + CV_LOG_INFO(NULL, "core(parallel): plugin is incompatible, missing init function: '" << init_name << "', file: " << lib_->getName()); + } + } + + + bool checkCompatibility(const OpenCV_API_Header& api_header, unsigned int abi_version, unsigned int api_version, bool checkMinorOpenCVVersion) + { + if (api_header.opencv_version_major != CV_VERSION_MAJOR) + { + CV_LOG_ERROR(NULL, "core(parallel): wrong OpenCV major version used by plugin '" << api_header.api_description << "': " << + cv::format("%d.%d, OpenCV version is '" CV_VERSION "'", api_header.opencv_version_major, api_header.opencv_version_minor)) + return false; + } + if (!checkMinorOpenCVVersion) + { + // no checks for OpenCV minor version + } + else if (api_header.opencv_version_minor != CV_VERSION_MINOR) + { + CV_LOG_ERROR(NULL, "core(parallel): wrong OpenCV minor version used by plugin '" << api_header.api_description << "': " << + cv::format("%d.%d, OpenCV version is '" CV_VERSION "'", api_header.opencv_version_major, api_header.opencv_version_minor)) + return false; + } + CV_LOG_DEBUG(NULL, "core(parallel): initialized '" << api_header.api_description << "': built with " + << cv::format("OpenCV %d.%d (ABI/API = %d/%d)", + api_header.opencv_version_major, api_header.opencv_version_minor, + api_header.min_api_version, api_header.api_version) + << ", current OpenCV version is '" CV_VERSION "' (ABI/API = " << abi_version << "/" << api_version << ")" + ); + if (api_header.min_api_version != abi_version) // future: range can be here + { + // actually this should 
never happen due to checks in plugin's init() function + CV_LOG_ERROR(NULL, "core(parallel): plugin is not supported due to incompatible ABI = " << api_header.min_api_version); + return false; + } + if (api_header.api_version != api_version) + { + CV_LOG_INFO(NULL, "core(parallel): NOTE: plugin is supported, but there is API version mismatch: " + << cv::format("plugin API level (%d) != OpenCV API level (%d)", api_header.api_version, api_version)); + if (api_header.api_version < api_version) + { + CV_LOG_INFO(NULL, "core(parallel): NOTE: some functionality may be unavailable due to lack of support by plugin implementation"); + } + } + return true; + } + +public: + std::shared_ptr<cv::plugin::impl::DynamicLib> lib_; + const OpenCV_Core_Parallel_Plugin_API* plugin_api_; + + PluginParallelBackend(const std::shared_ptr<cv::plugin::impl::DynamicLib>& lib) + : lib_(lib) + , plugin_api_(NULL) + { + initPluginAPI(); + } + + std::shared_ptr<cv::parallel::ParallelForAPI> create() const + { + CV_Assert(plugin_api_); + + CvPluginParallelBackendAPI instancePtr = NULL; + + if (plugin_api_->v0.getInstance) + { + if (CV_ERROR_OK == plugin_api_->v0.getInstance(&instancePtr)) + { + CV_Assert(instancePtr); + // TODO C++20 "aliasing constructor" + return std::shared_ptr<cv::parallel::ParallelForAPI>(instancePtr, [](cv::parallel::ParallelForAPI*){}); // empty deleter + } + } + return std::shared_ptr<cv::parallel::ParallelForAPI>(); + } +}; + + +class PluginParallelBackendFactory CV_FINAL: public IParallelBackendFactory +{ +public: + std::string baseName_; + std::shared_ptr<PluginParallelBackend> backend; + bool initialized; +public: + PluginParallelBackendFactory(const std::string& baseName) + : baseName_(baseName) + , initialized(false) + { + // nothing, plugins are loaded on demand + } + + std::shared_ptr<cv::parallel::ParallelForAPI> create() const CV_OVERRIDE + { + if (!initialized) + { + const_cast<PluginParallelBackendFactory*>(this)->initBackend(); + } + if (backend) + return backend->create(); + return std::shared_ptr<cv::parallel::ParallelForAPI>(); + } +protected: + void initBackend() + { + AutoLock lock(getInitializationMutex()); + try + { + if (!initialized) + loadPlugin(); + } + catch (...) + { + CV_LOG_INFO(NULL, "core(parallel): exception during plugin loading: " << baseName_ << ". 
SKIP"); + } + initialized = true; + } + void loadPlugin(); +}; + +static +std::vector getPluginCandidates(const std::string& baseName) +{ + using namespace cv::utils; + using namespace cv::utils::fs; + const std::string baseName_l = toLowerCase(baseName); + const std::string baseName_u = toUpperCase(baseName); + const FileSystemPath_t baseName_l_fs = toFileSystemPath(baseName_l); + std::vector paths; + // TODO OPENCV_PLUGIN_PATH + const std::vector paths_ = getConfigurationParameterPaths("OPENCV_CORE_PLUGIN_PATH", std::vector()); + if (paths_.size() != 0) + { + for (size_t i = 0; i < paths_.size(); i++) + { + paths.push_back(toFileSystemPath(paths_[i])); + } + } + else + { + FileSystemPath_t binaryLocation; + if (getBinLocation(binaryLocation)) + { + binaryLocation = getParent(binaryLocation); +#ifndef CV_CORE_PARALLEL_PLUGIN_SUBDIRECTORY + paths.push_back(binaryLocation); +#else + paths.push_back(binaryLocation + toFileSystemPath("/") + toFileSystemPath(CV_CORE_PARALLEL_PLUGIN_SUBDIRECTORY_STR)); +#endif + } + } + const std::string default_expr = libraryPrefix() + "opencv_core_parallel_" + baseName_l + "*" + librarySuffix(); + const std::string plugin_expr = getConfigurationParameterString((std::string("OPENCV_CORE_PARALLEL_PLUGIN_") + baseName_u).c_str(), default_expr.c_str()); + std::vector results; +#ifdef _WIN32 + FileSystemPath_t moduleName = toFileSystemPath(libraryPrefix() + "opencv_core_parallel_" + baseName_l + librarySuffix()); + if (plugin_expr != default_expr) + { + moduleName = toFileSystemPath(plugin_expr); + results.push_back(moduleName); + } + for (const FileSystemPath_t& path : paths) + { + results.push_back(path + L"\\" + moduleName); + } + results.push_back(moduleName); +#else + CV_LOG_DEBUG(NULL, "core(parallel): " << baseName << " plugin's glob is '" << plugin_expr << "', " << paths.size() << " location(s)"); + for (const std::string& path : paths) + { + if (path.empty()) + continue; + std::vector candidates; + cv::glob(utils::fs::join(path, plugin_expr), candidates); + CV_LOG_DEBUG(NULL, " - " << path << ": " << candidates.size()); + copy(candidates.begin(), candidates.end(), back_inserter(results)); + } +#endif + CV_LOG_DEBUG(NULL, "Found " << results.size() << " plugin(s) for " << baseName); + return results; +} + +void PluginParallelBackendFactory::loadPlugin() +{ + for (const FileSystemPath_t& plugin : getPluginCandidates(baseName_)) + { + auto lib = std::make_shared(plugin); + if (!lib->isLoaded()) + { + continue; + } + try + { + auto pluginBackend = std::make_shared(lib); + if (!pluginBackend) + { + continue; + } + if (pluginBackend->plugin_api_ == NULL) + { + CV_LOG_ERROR(NULL, "core(parallel): no compatible plugin API for backend: " << baseName_ << " in " << toPrintablePath(plugin)); + continue; + } +#if !defined(_WIN32) + // NB: we are going to use parallel backend, so prevent automatic library unloading + // (avoid uncontrolled crashes in worker threads of underlying libraries: libgomp, libtbb) + // details: https://github.com/opencv/opencv/pull/19470#pullrequestreview-589834777 + lib->disableAutomaticLibraryUnloading(); +#endif + backend = pluginBackend; + return; + } + catch (...) + { + CV_LOG_WARNING(NULL, "core(parallel): exception during plugin initialization: " << toPrintablePath(plugin) << ". 
SKIP"); + } + } +} + +#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(PARALLEL_ENABLE_PLUGINS) + +} // namespace + +namespace parallel { + +std::shared_ptr createPluginParallelBackendFactory(const std::string& baseName) +{ +#if OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(PARALLEL_ENABLE_PLUGINS) + return std::make_shared(baseName); +#else + CV_UNUSED(baseName); + return std::shared_ptr(); +#endif +} + +}} // namespace diff --git a/modules/core/src/parallel/registry_parallel.hpp b/modules/core/src/parallel/registry_parallel.hpp new file mode 100644 index 0000000000..97464f278f --- /dev/null +++ b/modules/core/src/parallel/registry_parallel.hpp @@ -0,0 +1,25 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CORE_PARALLEL_REGISTRY_HPP +#define OPENCV_CORE_PARALLEL_REGISTRY_HPP + +#include "factory_parallel.hpp" + +namespace cv { namespace parallel { + +struct ParallelBackendInfo +{ + int priority; // 1000- - default builtin priority + // 0 - disabled (OPENCV_PARALLEL_PRIORITY_ = 0) + // >10000 - prioritized list (OPENCV_PARALLEL_PRIORITY_LIST) + std::string name; + std::shared_ptr backendFactory; +}; + +const std::vector& getParallelBackendsInfo(); + +}} // namespace + +#endif // OPENCV_CORE_PARALLEL_REGISTRY_HPP diff --git a/modules/core/src/parallel/registry_parallel.impl.hpp b/modules/core/src/parallel/registry_parallel.impl.hpp new file mode 100644 index 0000000000..c8b57e7d6c --- /dev/null +++ b/modules/core/src/parallel/registry_parallel.impl.hpp @@ -0,0 +1,173 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// +// Not a standalone header, part of parallel.cpp +// + +#include "opencv2/core/utils/filesystem.private.hpp" // OPENCV_HAVE_FILESYSTEM_SUPPORT + +namespace cv { namespace parallel { + +#if OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(PARALLEL_ENABLE_PLUGINS) +#define DECLARE_DYNAMIC_BACKEND(name) \ +ParallelBackendInfo { \ + 1000, name, createPluginParallelBackendFactory(name) \ +}, +#else +#define DECLARE_DYNAMIC_BACKEND(name) /* nothing */ +#endif + +#define DECLARE_STATIC_BACKEND(name, createBackendAPI) \ +ParallelBackendInfo { \ + 1000, name, std::make_shared([=] () -> std::shared_ptr { return createBackendAPI(); }) \ +}, + +static +std::vector& getBuiltinParallelBackendsInfo() +{ + static std::vector g_backends + { +#ifdef HAVE_TBB + DECLARE_STATIC_BACKEND("TBB", createParallelBackendTBB) +#elif defined(PARALLEL_ENABLE_PLUGINS) + DECLARE_DYNAMIC_BACKEND("ONETBB") // dedicated oneTBB plugin (interface >= 12000, binary incompatibe with TBB 2017-2020) + DECLARE_DYNAMIC_BACKEND("TBB") // generic TBB plugins +#endif + +#ifdef HAVE_OPENMP + DECLARE_STATIC_BACKEND("OPENMP", createParallelBackendOpenMP) +#elif defined(PARALLEL_ENABLE_PLUGINS) + DECLARE_DYNAMIC_BACKEND("OPENMP") // TODO Intel OpenMP? 
+#endif + }; + return g_backends; +}; + +static +bool sortByPriority(const ParallelBackendInfo &lhs, const ParallelBackendInfo &rhs) +{ + return lhs.priority > rhs.priority; +} + +/** @brief Manages list of enabled backends + */ +class ParallelBackendRegistry +{ +protected: + std::vector enabledBackends; + ParallelBackendRegistry() + { + enabledBackends = getBuiltinParallelBackendsInfo(); + int N = (int)enabledBackends.size(); + for (int i = 0; i < N; i++) + { + ParallelBackendInfo& info = enabledBackends[i]; + info.priority = 1000 - i * 10; + } + CV_LOG_DEBUG(NULL, "core(parallel): Builtin backends(" << N << "): " << dumpBackends()); + if (readPrioritySettings()) + { + CV_LOG_INFO(NULL, "core(parallel): Updated backends priorities: " << dumpBackends()); + N = (int)enabledBackends.size(); + } + int enabled = 0; + for (int i = 0; i < N; i++) + { + ParallelBackendInfo& info = enabledBackends[enabled]; + if (enabled != i) + info = enabledBackends[i]; + size_t param_priority = utils::getConfigurationParameterSizeT(cv::format("OPENCV_PARALLEL_PRIORITY_%s", info.name.c_str()).c_str(), (size_t)info.priority); + CV_Assert(param_priority == (size_t)(int)param_priority); // overflow check + if (param_priority > 0) + { + info.priority = (int)param_priority; + enabled++; + } + else + { + CV_LOG_INFO(NULL, "core(parallel): Disable backend: " << info.name); + } + } + enabledBackends.resize(enabled); + CV_LOG_DEBUG(NULL, "core(parallel): Available backends(" << enabled << "): " << dumpBackends()); + std::sort(enabledBackends.begin(), enabledBackends.end(), sortByPriority); + CV_LOG_INFO(NULL, "core(parallel): Enabled backends(" << enabled << ", sorted by priority): " << (enabledBackends.empty() ? std::string("N/A") : dumpBackends())); + } + + static std::vector tokenize_string(const std::string& input, char token) + { + std::vector result; + std::string::size_type prev_pos = 0, pos = 0; + while((pos = input.find(token, pos)) != std::string::npos) + { + result.push_back(input.substr(prev_pos, pos-prev_pos)); + prev_pos = ++pos; + } + result.push_back(input.substr(prev_pos)); + return result; + } + bool readPrioritySettings() + { + bool hasChanges = false; + cv::String prioritized_backends = utils::getConfigurationParameterString("OPENCV_PARALLEL_PRIORITY_LIST", NULL); + if (prioritized_backends.empty()) + return hasChanges; + CV_LOG_INFO(NULL, "core(parallel): Configured priority list (OPENCV_PARALLEL_PRIORITY_LIST): " << prioritized_backends); + const std::vector names = tokenize_string(prioritized_backends, ','); + for (size_t i = 0; i < names.size(); i++) + { + const std::string& name = names[i]; + int priority = (int)(100000 + (names.size() - i) * 1000); + bool found = false; + for (size_t k = 0; k < enabledBackends.size(); k++) + { + ParallelBackendInfo& info = enabledBackends[k]; + if (name == info.name) + { + info.priority = priority; + CV_LOG_DEBUG(NULL, "core(parallel): New backend priority: '" << name << "' => " << info.priority); + found = true; + hasChanges = true; + break; + } + } + if (!found) + { + CV_LOG_INFO(NULL, "core(parallel): Adding parallel backend (plugin): '" << name << "'"); + enabledBackends.push_back(ParallelBackendInfo{priority, name, createPluginParallelBackendFactory(name)}); + hasChanges = true; + } + } + return hasChanges; + } +public: + std::string dumpBackends() const + { + std::ostringstream os; + for (size_t i = 0; i < enabledBackends.size(); i++) + { + if (i > 0) os << "; "; + const ParallelBackendInfo& info = enabledBackends[i]; + os << info.name << '(' << 
info.priority << ')'; + } + return os.str(); + } + + static ParallelBackendRegistry& getInstance() + { + static ParallelBackendRegistry g_instance; + return g_instance; + } + + inline const std::vector& getEnabledBackends() const { return enabledBackends; } +}; + + +const std::vector& getParallelBackendsInfo() +{ + return cv::parallel::ParallelBackendRegistry::getInstance().getEnabledBackends(); +} + +}} // namespace diff --git a/modules/core/src/parallel_impl.cpp b/modules/core/src/parallel_impl.cpp index 09579a3b14..25bf4adce8 100644 --- a/modules/core/src/parallel_impl.cpp +++ b/modules/core/src/parallel_impl.cpp @@ -356,6 +356,16 @@ public: }; +// Disable thread sanitization check when CV_USE_GLOBAL_WORKERS_COND_VAR is not +// set because it triggers as the main thread reads isActive while the children +// thread writes it (but it all works out because a mutex is locked in the main +// thread and isActive re-checked). +// This is to solve issue #19463. +#if !defined(CV_USE_GLOBAL_WORKERS_COND_VAR) && defined(__clang__) && defined(__has_feature) +#if __has_feature(thread_sanitizer) +__attribute__((no_sanitize("thread"))) +#endif +#endif void WorkerThread::thread_body() { (void)cv::utils::getThreadID(); // notify OpenCV about new thread diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 21e281c007..5a0a7637c2 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -43,6 +43,10 @@ #ifndef __OPENCV_PRECOMP_H__ #define __OPENCV_PRECOMP_H__ +#ifdef BUILD_PLUGIN +#include "opencv2/core/utility.hpp" +#else // BUILD_PLUGIN + #include "opencv2/opencv_modules.hpp" #include "cvconfig.h" @@ -375,4 +379,5 @@ int cv_snprintf(char* buf, int len, const char* fmt, ...); int cv_vsnprintf(char* buf, int len, const char* fmt, va_list args); } -#endif /*_CXCORE_INTERNAL_H_*/ +#endif // BUILD_PLUGIN +#endif // __OPENCV_PRECOMP_H__ diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp index 5d6dfb084a..0647c95486 100644 --- a/modules/core/src/rand.cpp +++ b/modules/core/src/rand.cpp @@ -750,6 +750,9 @@ void cv::randShuffle( InputOutputArray _dst, double iterFactor, RNG* _rng ) func( dst, rng, iterFactor ); } + +#ifndef OPENCV_EXCLUDE_C_API + CV_IMPL void cvRandArr( CvRNG* _rng, CvArr* arr, int disttype, CvScalar param1, CvScalar param2 ) { @@ -767,6 +770,9 @@ CV_IMPL void cvRandShuffle( CvArr* arr, CvRNG* _rng, double iter_factor ) cv::randShuffle( dst, iter_factor, &rng ); } +#endif // OPENCV_EXCLUDE_C_API + + // Mersenne Twister random number generator. // Inspired by http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c diff --git a/modules/core/src/stat_c.cpp b/modules/core/src/stat_c.cpp index d7355b9f94..8b6f0f09e4 100644 --- a/modules/core/src/stat_c.cpp +++ b/modules/core/src/stat_c.cpp @@ -5,6 +5,8 @@ #include "precomp.hpp" +#ifndef OPENCV_EXCLUDE_C_API + CV_IMPL CvScalar cvSum( const CvArr* srcarr ) { cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1)); @@ -117,3 +119,5 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr ) return !maskarr ? 
cv::norm(a, b, normType) : cv::norm(a, b, normType, mask); } + +#endif // OPENCV_EXCLUDE_C_API diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index e0fdde33e8..97a2a289c7 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -128,11 +128,14 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); } #endif -#if CV_VSX && defined __linux__ +#if (defined __ppc64__ || defined __PPC64__) && defined __linux__ # include "sys/auxv.h" # ifndef AT_HWCAP2 # define AT_HWCAP2 26 # endif +# ifndef PPC_FEATURE2_ARCH_2_07 +# define PPC_FEATURE2_ARCH_2_07 0x80000000 +# endif # ifndef PPC_FEATURE2_ARCH_3_00 # define PPC_FEATURE2_ARCH_3_00 0x00800000 # endif @@ -345,7 +348,6 @@ struct HWFeatures HWFeatures(bool run_initialize = false) { - memset( have, 0, sizeof(have[0]) * MAX_FEATURE ); if (run_initialize) initialize(); } @@ -589,14 +591,25 @@ struct HWFeatures #ifdef __mips_msa have[CV_CPU_MSA] = true; #endif - // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs - have[CV_CPU_VSX] = (CV_VSX); - // TODO: Check VSX3 availability in runtime for other platforms - #if CV_VSX && defined __linux__ - uint64 hwcap2 = getauxval(AT_HWCAP2); - have[CV_CPU_VSX3] = (hwcap2 & PPC_FEATURE2_ARCH_3_00); + + #if (defined __ppc64__ || defined __PPC64__) && defined __linux__ + unsigned int hwcap = getauxval(AT_HWCAP); + if (hwcap & PPC_FEATURE_HAS_VSX) { + hwcap = getauxval(AT_HWCAP2); + if (hwcap & PPC_FEATURE2_ARCH_3_00) { + have[CV_CPU_VSX] = have[CV_CPU_VSX3] = true; + } else { + have[CV_CPU_VSX] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; + } + } #else - have[CV_CPU_VSX3] = (CV_VSX3); + // TODO: AIX, FreeBSD + #if CV_VSX || defined _ARCH_PWR8 || defined __POWER9_VECTOR__ + have[CV_CPU_VSX] = true; + #endif + #if CV_VSX3 || defined __POWER9_VECTOR__ + have[CV_CPU_VSX3] = true; + #endif #endif #if defined __riscv && defined __riscv_vector @@ -730,7 +743,7 @@ struct HWFeatures } } - bool have[MAX_FEATURE+1]; + bool have[MAX_FEATURE+1]{}; }; static HWFeatures featuresEnabled(true), featuresDisabled = HWFeatures(false); @@ -1810,7 +1823,7 @@ class ParseError { std::string bad_value; public: - ParseError(const std::string bad_value_) :bad_value(bad_value_) {} + ParseError(const std::string &bad_value_) :bad_value(bad_value_) {} std::string toString(const std::string ¶m) const { std::ostringstream out; @@ -2313,6 +2326,13 @@ public: ippTopFeatures = ippCPUID_SSE42; pIppLibInfo = ippiGetLibVersion(); + + // workaround: https://github.com/opencv/opencv/issues/12959 + std::string ippName(pIppLibInfo->Name ? 
pIppLibInfo->Name : ""); + if (ippName.find("SSE4.2") != std::string::npos) + { + ippTopFeatures = ippCPUID_SSE42; + } } public: @@ -2344,16 +2364,12 @@ unsigned long long getIppFeatures() #endif } -unsigned long long getIppTopFeatures(); - +#ifdef HAVE_IPP unsigned long long getIppTopFeatures() { -#ifdef HAVE_IPP return getIPPSingleton().ippTopFeatures; -#else - return 0; -#endif } +#endif void setIppStatus(int status, const char * const _funcname, const char * const _filename, int _line) { diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 0ec6270a70..09ba92ecde 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -230,7 +230,7 @@ UMatDataAutoLock::~UMatDataAutoLock() //////////////////////////////// UMat //////////////////////////////// -UMat::UMat(UMatUsageFlags _usageFlags) +UMat::UMat(UMatUsageFlags _usageFlags) CV_NOEXCEPT : flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows) {} @@ -1318,88 +1318,6 @@ UMat UMat::t() const return m; } -UMat UMat::inv(int method) const -{ - UMat m; - invert(*this, m, method); - return m; -} - -UMat UMat::mul(InputArray m, double scale) const -{ - UMat dst; - multiply(*this, m, dst, scale); - return dst; -} - -#ifdef HAVE_OPENCL - -static bool ocl_dot( InputArray _src1, InputArray _src2, double & res ) -{ - UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1); - - int type = src1.type(), depth = CV_MAT_DEPTH(type), - kercn = ocl::predictOptimalVectorWidth(src1, src2); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - - if ( !doubleSupport && depth == CV_64F ) - return false; - - int dbsize = ocl::Device::getDefault().maxComputeUnits(); - size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); - int ddepth = std::max(CV_32F, depth); - - int wgs2_aligned = 1; - while (wgs2_aligned < (int)wgs) - wgs2_aligned <<= 1; - wgs2_aligned >>= 1; - - char cvt[40]; - ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT " - "-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d", - ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth), - ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), - ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt), - (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", - _src1.isContinuous() ? " -D HAVE_SRC_CONT" : "", - _src2.isContinuous() ? 
" -D HAVE_SRC2_CONT" : "", kercn)); - if (k.empty()) - return false; - - UMat db(1, dbsize, ddepth); - - ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1), - src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), - dbarg = ocl::KernelArg::PtrWriteOnly(db); - - k.args(src1arg, src1.cols, (int)src1.total(), dbsize, dbarg, src2arg); - - size_t globalsize = dbsize * wgs; - if (k.run(1, &globalsize, &wgs, true)) - { - res = sum(db.getMat(ACCESS_READ))[0]; - return true; - } - return false; -} - -#endif - -double UMat::dot(InputArray m) const -{ - CV_INSTRUMENT_REGION(); - - CV_Assert(m.sameSize(*this) && m.type() == type()); - -#ifdef HAVE_OPENCL - double r = 0; - CV_OCL_RUN_(dims <= 2, ocl_dot(*this, m, r), r) -#endif - - return getMat(ACCESS_READ).dot(m); -} - UMat UMat::zeros(int rows, int cols, int type) { return UMat(rows, cols, type, Scalar::all(0)); @@ -1430,18 +1348,6 @@ UMat UMat::ones(int ndims, const int* sz, int type) return UMat(ndims, sz, type, Scalar(1)); } -UMat UMat::eye(int rows, int cols, int type) -{ - return UMat::eye(Size(cols, rows), type); -} - -UMat UMat::eye(Size size, int type) -{ - UMat m(size, type); - setIdentity(m); - return m; -} - } /* End of file. */ diff --git a/modules/core/src/utils/filesystem.cpp b/modules/core/src/utils/filesystem.cpp index e75640b86b..17004b27dd 100644 --- a/modules/core/src/utils/filesystem.cpp +++ b/modules/core/src/utils/filesystem.cpp @@ -587,3 +587,8 @@ cv::String getCacheDirectory(const char* /*sub_directory_name*/, const char* /*c #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT }}} // namespace + + +#if OPENCV_HAVE_FILESYSTEM_SUPPORT +#include "plugin_loader.impl.hpp" +#endif diff --git a/modules/core/src/utils/plugin_loader.impl.hpp b/modules/core/src/utils/plugin_loader.impl.hpp new file mode 100644 index 0000000000..4173c9d802 --- /dev/null +++ b/modules/core/src/utils/plugin_loader.impl.hpp @@ -0,0 +1,80 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// +// Not a standalone header, part of filesystem.cpp +// + +#include "opencv2/core/utils/plugin_loader.private.hpp" + +#if !OPENCV_HAVE_FILESYSTEM_SUPPORT +#error "Invalid build configuration" +#endif + +#if 0 // TODO +#ifdef NDEBUG +#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1 +#else +#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_VERBOSE + 1 +#endif +#include +#endif + +namespace cv { namespace plugin { namespace impl { + +DynamicLib::DynamicLib(const FileSystemPath_t& filename) + : handle(0), fname(filename), disableAutoUnloading_(false) +{ + libraryLoad(filename); +} + +DynamicLib::~DynamicLib() +{ + if (!disableAutoUnloading_) + { + libraryRelease(); + } + else if (handle) + { + CV_LOG_INFO(NULL, "skip auto unloading (disabled): " << toPrintablePath(fname)); + handle = 0; + } +} + +void* DynamicLib::getSymbol(const char* symbolName) const +{ + if (!handle) + { + return 0; + } + void* res = getSymbol_(handle, symbolName); + if (!res) + { + CV_LOG_DEBUG(NULL, "No symbol '" << symbolName << "' in " << toPrintablePath(fname)); + } + return res; +} + +const std::string DynamicLib::getName() const +{ + return toPrintablePath(fname); +} + +void DynamicLib::libraryLoad(const FileSystemPath_t& filename) +{ + handle = libraryLoad_(filename); + CV_LOG_INFO(NULL, "load " << toPrintablePath(filename) << " => " << (handle ? 
"OK" : "FAILED")); +} + +void DynamicLib::libraryRelease() +{ + if (handle) + { + CV_LOG_INFO(NULL, "unload "<< toPrintablePath(fname)); + libraryRelease_(handle); + handle = 0; + } +} + +}}} // namespace diff --git a/modules/core/src/va_intel.cpp b/modules/core/src/va_intel.cpp index c640a08658..1d2b1cbf32 100644 --- a/modules/core/src/va_intel.cpp +++ b/modules/core/src/va_intel.cpp @@ -33,6 +33,17 @@ using namespace cv; #endif #endif +#ifdef HAVE_VA +#ifndef OPENCV_LIBVA_LINK +#include "va_wrapper.impl.hpp" +#else +namespace cv { namespace detail { +static void init_libva() { /* nothing */ } +}} // namespace +#endif +using namespace cv::detail; +#endif + namespace cv { namespace va_intel { #ifdef HAVE_VA_INTEL @@ -54,6 +65,8 @@ Context& initializeContextFromVA(VADisplay display, bool tryInterop) #if !defined(HAVE_VA) NO_VA_SUPPORT_ERROR; #else // !HAVE_VA + init_libva(); + # ifdef HAVE_VA_INTEL contextInitialized = false; if (tryInterop) @@ -176,7 +189,7 @@ static bool ocl_convert_nv12_to_bgr(cl_mem clImageY, cl_mem clImageUV, cl_mem cl k.args(clImageY, clImageUV, clBuffer, step, cols, rows); - size_t globalsize[] = { (size_t)cols, (size_t)rows }; + size_t globalsize[] = { (size_t)cols/2, (size_t)rows/2 }; return k.run(2, globalsize, 0, false); } @@ -189,7 +202,7 @@ static bool ocl_convert_bgr_to_nv12(cl_mem clBuffer, int step, int cols, int row k.args(clBuffer, step, cols, rows, clImageY, clImageUV); - size_t globalsize[] = { (size_t)cols, (size_t)rows }; + size_t globalsize[] = { (size_t)cols/2, (size_t)rows/2 }; return k.run(2, globalsize, 0, false); } #endif // HAVE_VA_INTEL @@ -507,6 +520,8 @@ void convertToVASurface(VADisplay display, InputArray src, VASurfaceID surface, #if !defined(HAVE_VA) NO_VA_SUPPORT_ERROR; #else // !HAVE_VA + init_libva(); + const int stype = CV_8UC3; int srcType = src.type(); @@ -611,6 +626,8 @@ void convertFromVASurface(VADisplay display, VASurfaceID surface, Size size, Out #if !defined(HAVE_VA) NO_VA_SUPPORT_ERROR; #else // !HAVE_VA + init_libva(); + const int dtype = CV_8UC3; // TODO Need to specify ACCESS_WRITE here somehow to prevent useless data copying! diff --git a/modules/core/src/va_wrapper.impl.hpp b/modules/core/src/va_wrapper.impl.hpp new file mode 100644 index 0000000000..260d3ba49b --- /dev/null +++ b/modules/core/src/va_wrapper.impl.hpp @@ -0,0 +1,85 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+
+//
+// Not a standalone header, part of va_intel.cpp
+//
+
+#include "opencv2/core/utils/plugin_loader.private.hpp"  // DynamicLib
+
+namespace cv { namespace detail {
+
+typedef VAStatus (*FN_vaDeriveImage)(VADisplay dpy, VASurfaceID surface, VAImage *image);
+typedef VAStatus (*FN_vaDestroyImage)(VADisplay dpy, VAImageID image);
+typedef VAStatus (*FN_vaMapBuffer)(VADisplay dpy, VABufferID buf_id, void **pbuf);
+typedef VAStatus (*FN_vaSyncSurface)(VADisplay dpy, VASurfaceID render_target);
+typedef VAStatus (*FN_vaUnmapBuffer)(VADisplay dpy, VABufferID buf_id);
+
+static FN_vaDeriveImage fn_vaDeriveImage = NULL;
+static FN_vaDestroyImage fn_vaDestroyImage = NULL;
+static FN_vaMapBuffer fn_vaMapBuffer = NULL;
+static FN_vaSyncSurface fn_vaSyncSurface = NULL;
+static FN_vaUnmapBuffer fn_vaUnmapBuffer = NULL;
+
+#define vaDeriveImage fn_vaDeriveImage
+#define vaDestroyImage fn_vaDestroyImage
+#define vaMapBuffer fn_vaMapBuffer
+#define vaSyncSurface fn_vaSyncSurface
+#define vaUnmapBuffer fn_vaUnmapBuffer
+
+
+static std::shared_ptr<cv::plugin::impl::DynamicLib> loadLibVA()
+{
+    std::shared_ptr<cv::plugin::impl::DynamicLib> lib;
+    const char* envPath = getenv("OPENCV_LIBVA_RUNTIME");
+    if (envPath)
+    {
+        lib = std::make_shared<cv::plugin::impl::DynamicLib>(envPath);
+        return lib;
+    }
+    static const char* const candidates[] = {
+        "libva.so",
+        "libva.so.2",
+        "libva.so.1",
+    };
+    for (int i = 0; i < 3; ++i)
+    {
+        lib = std::make_shared<cv::plugin::impl::DynamicLib>(candidates[i]);
+        if (lib->isLoaded())
+            break;
+    }
+    return lib;
+}
+static void init_libva()
+{
+    static bool initialized = false;
+    static auto library = loadLibVA();
+    if (!initialized)
+    {
+        if (!library || !library->isLoaded())
+        {
+            library.reset();
+            CV_Error(cv::Error::StsBadFunc, "OpenCV can't load VA library (libva)");
+        }
+        auto& lib = *library.get();
+#define VA_LOAD_SYMBOL(name) fn_ ## name = reinterpret_cast<FN_ ## name>(lib.getSymbol(#name)); \
+        if (!fn_ ## name) \
+        { \
+            library.reset(); \
+            initialized = true; \
+            CV_Error_(cv::Error::StsBadFunc, ("OpenCV can't load VA library (libva), missing symbol: %s", #name)); \
+        }
+
+        VA_LOAD_SYMBOL(vaDeriveImage);
+        VA_LOAD_SYMBOL(vaDestroyImage);
+        VA_LOAD_SYMBOL(vaMapBuffer);
+        VA_LOAD_SYMBOL(vaSyncSurface);
+        VA_LOAD_SYMBOL(vaUnmapBuffer);
+        initialized = true;
+    }
+    if (!library)
+        CV_Error(cv::Error::StsBadFunc, "OpenCV can't load/initialize VA library (libva)");
+}
+
+}}  // namespace
diff --git a/modules/core/test/ocl/test_opencl.cpp b/modules/core/test/ocl/test_opencl.cpp
index 27cd82d424..e639f72948 100644
--- a/modules/core/test/ocl/test_opencl.cpp
+++ b/modules/core/test/ocl/test_opencl.cpp
@@ -120,6 +120,11 @@
     cv::ocl::ProgramSource src = cv::ocl::ProgramSource::fromSPIR(module_name, "simple_spir", (uchar*)&program_binary_code[0], program_binary_code.size(), "");
     cv::String errmsg;
     cv::ocl::Program program(src, "", errmsg);
+    if (program.ptr() == NULL && device.isAMD())
+    {
+        // https://community.amd.com/t5/opencl/spir-support-in-new-drivers-lost/td-p/170165
+        throw cvtest::SkipTestException("Bypass AMD OpenCL runtime bug: 'cl_khr_spir' extension is declared, but it doesn't really work");
+    }
     ASSERT_TRUE(program.ptr() != NULL);
     k.create("test_kernel", program);
 }
@@ -127,4 +132,120 @@
     testOpenCLKernel(k);
 }
 
+TEST(OpenCL, move_construct_assign)
+{
+    cv::ocl::Context ctx1 = cv::ocl::Context::getDefault();
+    if (!ctx1.ptr())
+    {
+        throw cvtest::SkipTestException("OpenCL is not available");
+    }
+    void* const ctx_ptr = ctx1.ptr();
+    cv::ocl::Context ctx2(std::move(ctx1));
+    ASSERT_EQ(ctx1.ptr(), nullptr);
ASSERT_EQ(ctx2.ptr(), ctx_ptr); + cv::ocl::Context ctx3 = std::move(ctx2); + ASSERT_EQ(ctx2.ptr(), nullptr); + ASSERT_EQ(ctx3.ptr(), ctx_ptr); + + cv::ocl::Platform pl1 = cv::ocl::Platform::getDefault(); + void* const pl_ptr = pl1.ptr(); + cv::ocl::Platform pl2(std::move(pl1)); + ASSERT_EQ(pl1.ptr(), nullptr); + ASSERT_EQ(pl2.ptr(), pl_ptr); + cv::ocl::Platform pl3 = std::move(pl2); + ASSERT_EQ(pl2.ptr(), nullptr); + ASSERT_EQ(pl3.ptr(), pl_ptr); + + std::vector platformInfos; + cv::ocl::getPlatfomsInfo(platformInfos); + const cv::String pi_name = platformInfos[0].name(); + cv::ocl::PlatformInfo pinfo2(std::move(platformInfos[0])); + ASSERT_EQ(platformInfos[0].name(), cv::String()); + ASSERT_EQ(pinfo2.name(), pi_name); + cv::ocl::PlatformInfo pinfo3 = std::move(pinfo2); + ASSERT_EQ(pinfo2.name(), cv::String()); + ASSERT_EQ(pinfo3.name(), pi_name); + + cv::ocl::Queue q1 = cv::ocl::Queue::getDefault(); + void* const q_ptr = q1.ptr(); + cv::ocl::Queue q2(std::move(q1)); + ASSERT_EQ(q1.ptr(), nullptr); + ASSERT_EQ(q2.ptr(), q_ptr); + cv::ocl::Queue q3 = std::move(q2); + ASSERT_EQ(q2.ptr(), nullptr); + ASSERT_EQ(q3.ptr(), q_ptr); + + cv::ocl::Device d1 = cv::ocl::Device::getDefault(); + if (!d1.compilerAvailable()) + { + throw cvtest::SkipTestException("OpenCL compiler is not available"); + } + void* const d_ptr = d1.ptr(); + cv::ocl::Device d2(std::move(d1)); + ASSERT_EQ(d1.ptr(), nullptr); + ASSERT_EQ(d2.ptr(), d_ptr); + cv::ocl::Device d3 = std::move(d2); + ASSERT_EQ(d2.ptr(), nullptr); + ASSERT_EQ(d3.ptr(), d_ptr); + + if (d3.imageSupport()) { + cv::UMat umat1 = cv::UMat::ones(640, 480, CV_32FC1); + cv::ocl::Image2D img1(umat1); + void *const img_ptr = img1.ptr(); + cv::ocl::Image2D img2(std::move(img1)); + ASSERT_EQ(img1.ptr(), nullptr); + ASSERT_EQ(img2.ptr(), img_ptr); + cv::ocl::Image2D img3 = std::move(img2); + ASSERT_EQ(img2.ptr(), nullptr); + ASSERT_EQ(img3.ptr(), img_ptr); + } + + static const char* opencl_kernel_src = +"__kernel void test_kernel(__global const uchar* src, int src_step, int src_offset,\n" +" __global uchar* dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" +" int c)\n" +"{\n" +" int x = get_global_id(0);\n" +" int y = get_global_id(1);\n" +" if (x < dst_cols && y < dst_rows)\n" +" {\n" +" int src_idx = y * src_step + x + src_offset;\n" +" int dst_idx = y * dst_step + x + dst_offset;\n" +" dst[dst_idx] = src[src_idx] + c;\n" +" }\n" +"}\n"; + cv::String module_name; // empty to disable OpenCL cache + cv::ocl::ProgramSource ps1(module_name, "move_construct_assign", opencl_kernel_src, ""); + cv::ocl::ProgramSource::Impl* const ps_ptr = ps1.getImpl(); + cv::ocl::ProgramSource ps2(std::move(ps1)); + ASSERT_EQ(ps1.getImpl(), nullptr); + ASSERT_EQ(ps2.getImpl(), ps_ptr); + cv::ocl::ProgramSource ps3 = std::move(ps2); + ASSERT_EQ(ps2.getImpl(), nullptr); + ASSERT_EQ(ps3.getImpl(), ps_ptr); + + cv::String errmsg; + cv::ocl::Program prog1(ps3, "", errmsg); + void* const prog_ptr = prog1.ptr(); + ASSERT_NE(prog_ptr, nullptr); + cv::ocl::Program prog2(std::move(prog1)); + ASSERT_EQ(prog1.ptr(), nullptr); + ASSERT_EQ(prog2.ptr(), prog_ptr); + cv::ocl::Program prog3 = std::move(prog2); + ASSERT_EQ(prog2.ptr(), nullptr); + ASSERT_EQ(prog3.ptr(), prog_ptr); + + cv::ocl::Kernel k1("test_kernel", prog3); + void* const k_ptr = k1.ptr(); + ASSERT_NE(k_ptr, nullptr); + cv::ocl::Kernel k2(std::move(k1)); + ASSERT_EQ(k1.ptr(), nullptr); + ASSERT_EQ(k2.ptr(), k_ptr); + cv::ocl::Kernel k3 = std::move(k2); + ASSERT_EQ(k2.ptr(), nullptr); + ASSERT_EQ(k3.ptr(), k_ptr); + + 
testOpenCLKernel(k3); +} + }} // namespace diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index 75a7004f81..effb0e68e0 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -2456,4 +2456,16 @@ TEST(Core_MinMaxIdx, rows_overflow) } +TEST(Core_Magnitude, regression_19506) +{ + for (int N = 1; N <= 64; ++N) + { + Mat a(1, N, CV_32FC1, Scalar::all(1e-20)); + Mat res; + magnitude(a, a, res); + EXPECT_LE(cvtest::norm(res, NORM_L1), 1e-15) << N; + } +} + + }} // namespace diff --git a/modules/core/test/test_cuda.cpp b/modules/core/test/test_cuda.cpp new file mode 100755 index 0000000000..a3e0a9034b --- /dev/null +++ b/modules/core/test/test_cuda.cpp @@ -0,0 +1,21 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#if defined(HAVE_CUDA) + +#include "test_precomp.hpp" +#include +#include "opencv2/core/cuda.hpp" + +namespace opencv_test { namespace { + +TEST(CUDA_Stream, construct_cudaFlags) +{ + cv::cuda::Stream stream(cudaStreamNonBlocking); + EXPECT_NE(stream.cudaPtr(), nullptr); +} + +}} // namespace + +#endif diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 84da496b42..269ebe0f2a 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1466,7 +1466,7 @@ template struct TheTest R r1 = vx_load_expand((const cv::float16_t*)data.a.d); R r2(r1); EXPECT_EQ(1.0f, r1.get0()); - vx_store(data_f32.a.d, r2); + v_store(data_f32.a.d, r2); EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]); out.a.clear(); diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 74ee167c54..9b6145d733 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -1988,7 +1988,6 @@ class TestInputArrayRangeChecking { C(EXPR); C(MATX); C(STD_VECTOR); - C(STD_ARRAY); C(NONE); C(STD_VECTOR_VECTOR); C(STD_BOOL_VECTOR); diff --git a/modules/core/test/test_misc.cpp b/modules/core/test/test_misc.cpp index 3934ceb716..67d0a53995 100644 --- a/modules/core/test/test_misc.cpp +++ b/modules/core/test/test_misc.cpp @@ -2,6 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
#include "test_precomp.hpp" +#include namespace opencv_test { namespace { @@ -189,7 +190,7 @@ TEST(Core_OutputArrayCreate, _13772) TEST(Core_String, find_last_of__with__empty_string) { cv::String s; - size_t p = s.find_last_of("q", 0); + size_t p = s.find_last_of('q', 0); // npos is not exported: EXPECT_EQ(cv::String::npos, p); EXPECT_EQ(std::string::npos, p); } @@ -783,5 +784,18 @@ TEST(Core_Check, testSize_1) } } +TEST(Core_Allocation, alignedAllocation) +{ + // iterate from size=1 to approximate byte size of 8K 32bpp image buffer + for (int i = 0; i < 200; i++) { + const size_t size = static_cast(std::pow(1.091, (double)i)); + void * const buf = cv::fastMalloc(size); + ASSERT_NE((uintptr_t)0, (uintptr_t)buf) + << "failed to allocate memory"; + ASSERT_EQ((uintptr_t)0, (uintptr_t)buf % CV_MALLOC_ALIGN) + << "memory not aligned to " << CV_MALLOC_ALIGN; + cv::fastFree(buf); + } +} }} // namespace diff --git a/modules/core/test/test_opencl.cpp b/modules/core/test/test_opencl.cpp index f4f195ea6e..17cd7b5c89 100644 --- a/modules/core/test/test_opencl.cpp +++ b/modules/core/test/test_opencl.cpp @@ -8,6 +8,23 @@ namespace opencv_test { namespace ocl { +static +testing::internal::ParamGenerator getOpenCLTestConfigurations() +{ + if (!cv::ocl::useOpenCL()) + { + return testing::ValuesIn(std::vector()); + } + + std::vector configurations = { + ":GPU:0", + ":GPU:1", + ":CPU:0", + }; + return testing::ValuesIn(configurations); +} + + static void executeUMatCall(bool requireOpenCL = true) { UMat a(100, 100, CV_8UC1, Scalar::all(0)); @@ -45,7 +62,7 @@ TEST(OCL_Context, createFromDevice) EXPECT_TRUE(context.getImpl() == context2.getImpl()) << "Broken cache for OpenCL context (device)"; } -TEST(OCL_OpenCLExecutionContext, basic) +TEST(OCL_OpenCLExecutionContextDefault, basic) { bool useOCL = cv::ocl::useOpenCL(); @@ -72,7 +89,7 @@ TEST(OCL_OpenCLExecutionContext, basic) EXPECT_TRUE(queue.getImpl() == queue2.getImpl()); } -TEST(OCL_OpenCLExecutionContext, createAndBind) +TEST(OCL_OpenCLExecutionContextDefault, createAndBind) { bool useOCL = cv::ocl::useOpenCL(); @@ -106,7 +123,9 @@ TEST(OCL_OpenCLExecutionContext, createAndBind) } } -TEST(OCL_OpenCLExecutionContext, createGPU) +typedef testing::TestWithParam OCL_OpenCLExecutionContext_P; + +TEST_P(OCL_OpenCLExecutionContext_P, multipleBindAndExecute) { bool useOCL = cv::ocl::useOpenCL(); @@ -120,12 +139,11 @@ TEST(OCL_OpenCLExecutionContext, createGPU) ASSERT_FALSE(ctx.empty()); - ocl::Context context = ocl::Context::create(":GPU:1"); + std::string opencl_device = GetParam(); + ocl::Context context = ocl::Context::create(opencl_device); if (context.empty()) { - context = ocl::Context::create(":CPU:"); - if (context.empty()) - throw SkipTestException("OpenCL GPU1/CPU devices are not available"); + throw SkipTestException(std::string("OpenCL device is not available: '") + opencl_device + "'"); } ocl::Device device = context.device(0); @@ -135,8 +153,10 @@ TEST(OCL_OpenCLExecutionContext, createGPU) try { + std::cout << "ctx2..." << std::endl; ctx2.bind(); executeUMatCall(); + std::cout << "ctx..." 
<< std::endl; ctx.bind(); executeUMatCall(); } @@ -147,7 +167,7 @@ TEST(OCL_OpenCLExecutionContext, createGPU) } } -TEST(OCL_OpenCLExecutionContext, ScopeTest) +TEST_P(OCL_OpenCLExecutionContext_P, ScopeTest) { bool useOCL = cv::ocl::useOpenCL(); @@ -161,12 +181,11 @@ TEST(OCL_OpenCLExecutionContext, ScopeTest) ASSERT_FALSE(ctx.empty()); - ocl::Context context = ocl::Context::create(":GPU:1"); + std::string opencl_device = GetParam(); + ocl::Context context = ocl::Context::create(opencl_device); if (context.empty()) { - context = ocl::Context::create(":CPU:"); - if (context.empty()) - context = ctx.getContext(); + throw SkipTestException(std::string("OpenCL device is not available: '") + opencl_device + "'"); } ocl::Device device = context.device(0); @@ -188,4 +207,9 @@ TEST(OCL_OpenCLExecutionContext, ScopeTest) executeUMatCall(); } + + +INSTANTIATE_TEST_CASE_P(/*nothing*/, OCL_OpenCLExecutionContext_P, getOpenCLTestConfigurations()); + + } } // namespace opencv_test::ocl diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp index 645045674a..934028f3ae 100644 --- a/modules/core/test/test_operations.cpp +++ b/modules/core/test/test_operations.cpp @@ -1551,4 +1551,14 @@ TEST(Core_MatExpr, empty_check_15760) EXPECT_THROW(Mat c = Mat().cross(Mat()), cv::Exception); } +TEST(Core_Arithm, scalar_handling_19599) // https://github.com/opencv/opencv/issues/19599 (OpenCV 4.x+ only) +{ + Mat a(1, 1, CV_32F, Scalar::all(1)); + Mat b(4, 1, CV_64F, Scalar::all(1)); // MatExpr may convert Scalar to Mat + Mat c; + EXPECT_NO_THROW(cv::multiply(a, b, c)); + EXPECT_EQ(1, c.cols); + EXPECT_EQ(1, c.rows); +} + }} // namespace diff --git a/modules/core/test/test_quaternion.cpp b/modules/core/test/test_quaternion.cpp index 0025674ec7..4e4e89629c 100644 --- a/modules/core/test/test_quaternion.cpp +++ b/modules/core/test/test_quaternion.cpp @@ -3,11 +3,15 @@ // of this distribution and at http://opencv.org/license.html. 
#include "test_precomp.hpp" +#include // EXPECT_MAT_NEAR + #include -#include -using namespace cv; +#include + namespace opencv_test{ namespace { -class QuatTest: public ::testing::Test { + +class QuatTest: public ::testing::Test +{ protected: void SetUp() override { @@ -18,7 +22,7 @@ protected: } double scalar = 2.5; double angle = CV_PI; - int qNorm2 = 2; + double qNorm2 = 2; Vec axis{1, 1, 1}; Vec unAxis{0, 0, 0}; Vec unitAxis{1.0 / sqrt(3), 1.0 / sqrt(3), 1.0 / sqrt(3)}; @@ -37,7 +41,8 @@ protected: }; -TEST_F(QuatTest, constructor){ +TEST_F(QuatTest, constructor) +{ Vec coeff{1, 2, 3, 4}; EXPECT_EQ(Quat (coeff), q1); EXPECT_EQ(q3, q3UnitAxis); @@ -78,7 +83,8 @@ TEST_F(QuatTest, constructor){ EXPECT_EQ(Quatd::createFromRvec(Vec3d(0, 0, 0)), qIdentity); } -TEST_F(QuatTest, basicfuns){ +TEST_F(QuatTest, basicfuns) +{ Quat q1Conj{1, -2, -3, -4}; EXPECT_EQ(q3Norm2.normalize(), q3); EXPECT_EQ(q1.norm(), sqrt(30)); @@ -124,7 +130,7 @@ TEST_F(QuatTest, basicfuns){ EXPECT_EQ(exp(qNull), qIdentity); EXPECT_EQ(exp(Quatd(0, angle * unitAxis[0] / 2, angle * unitAxis[1] / 2, angle * unitAxis[2] / 2)), q3); - EXPECT_EQ(power(q3, 2), Quatd::createFromAngleAxis(2*angle, axis)); + EXPECT_EQ(power(q3, 2.0), Quatd::createFromAngleAxis(2*angle, axis)); EXPECT_EQ(power(Quatd(0.5, 0.5, 0.5, 0.5), 2.0, assumeUnit), Quatd(-0.5,0.5,0.5,0.5)); EXPECT_EQ(power(Quatd(0.5, 0.5, 0.5, 0.5), -2.0), Quatd(-0.5,-0.5,-0.5,-0.5)); EXPECT_EQ(sqrt(q1), power(q1, 0.5)); @@ -160,7 +166,8 @@ TEST_F(QuatTest, basicfuns){ EXPECT_EQ(tan(atan(q1)), q1); } -TEST_F(QuatTest, opeartor){ +TEST_F(QuatTest, test_operator) +{ Quatd minusQ{-1, -2, -3, -4}; Quatd qAdd{3.5, 0, 6.5, 8}; Quatd qMinus{-1.5, 4, -0.5, 0}; @@ -171,7 +178,15 @@ TEST_F(QuatTest, opeartor){ EXPECT_EQ(-q1, minusQ); EXPECT_EQ(q1 + q2, qAdd); + EXPECT_EQ(q1 + scalar, Quatd(3.5, 2, 3, 4)); + EXPECT_EQ(scalar + q1, Quatd(3.5, 2, 3, 4)); + EXPECT_EQ(q1 + 2.0, Quatd(3, 2, 3, 4)); + EXPECT_EQ(2.0 + q1, Quatd(3, 2, 3, 4)); EXPECT_EQ(q1 - q2, qMinus); + EXPECT_EQ(q1 - scalar, Quatd(-1.5, 2, 3, 4)); + EXPECT_EQ(scalar - q1, Quatd(1.5, -2, -3, -4)); + EXPECT_EQ(q1 - 2.0, Quatd(-1, 2, 3, 4)); + EXPECT_EQ(2.0 - q1, Quatd(1, -2, -3, -4)); EXPECT_EQ(q1 * q2, qMultq); EXPECT_EQ(q1 * scalar, qMults); EXPECT_EQ(scalar * q1, qMults); @@ -195,7 +210,8 @@ TEST_F(QuatTest, opeartor){ EXPECT_ANY_THROW(q1.at(4)); } -TEST_F(QuatTest, quatAttrs){ +TEST_F(QuatTest, quatAttrs) +{ double angleQ1 = 2 * acos(1.0 / sqrt(30)); Vec3d axis1{0.3713906763541037, 0.557086014, 0.742781352}; Vec q1axis1 = q1.getAxis(); @@ -215,7 +231,8 @@ TEST_F(QuatTest, quatAttrs){ EXPECT_NEAR(axis1[2], axis1[2], 1e-6); } -TEST_F(QuatTest, interpolation){ +TEST_F(QuatTest, interpolation) +{ Quatd qNoRot = Quatd::createFromAngleAxis(0, axis); Quatd qLerpInter(1.0 / 2, sqrt(3) / 6, sqrt(3) / 6, sqrt(3) / 6); EXPECT_EQ(Quatd::lerp(qNoRot, q3, 0), qNoRot); @@ -250,6 +267,226 @@ TEST_F(QuatTest, interpolation){ EXPECT_EQ(Quatd::spline(tr1, tr2, tr3, tr3, 0.5), Quatd(0.336889853392, 0.543600719487, 0.543600719487, 0.543600719487)); } -} // namespace +static const Quatd qEuler[24] = { + Quatd(0.7233214, 0.3919013, 0.2005605, 0.5319728), //INT_XYZ + Quatd(0.8223654, 0.0222635, 0.3604221, 0.4396766), //INT_XZY + Quatd(0.822365, 0.439677, 0.0222635, 0.360422), //INT_YXZ + Quatd(0.723321, 0.531973, 0.391901, 0.20056), //INT_YZX + Quatd(0.723321, 0.20056, 0.531973, 0.391901), //INT_ZXY + Quatd(0.822365, 0.360422, 0.439677, 0.0222635), //INT_ZYX + Quatd(0.653285, 0.65328, 0.369641, -0.0990435), //INT_XYX + Quatd(0.653285, 0.65328, 
0.0990435, 0.369641), //INT_XZX + Quatd(0.653285, 0.369641, 0.65328, 0.0990435), //INT_YXY + Quatd(0.653285, -0.0990435, 0.65328, 0.369641), //INT_YZY + Quatd(0.653285, 0.369641, -0.0990435, 0.65328), //INT_ZXZ + Quatd(0.653285, 0.0990435, 0.369641, 0.65328), //INT_ZYZ -}// opencv_test \ No newline at end of file + Quatd(0.822365, 0.0222635, 0.439677, 0.360422), //EXT_XYZ + Quatd(0.723321, 0.391901, 0.531973, 0.20056), //EXT_XZY + Quatd(0.723321, 0.20056, 0.391901, 0.531973), //EXT_YXZ + Quatd(0.822365, 0.360422, 0.0222635, 0.439677), //EXT_YZX + Quatd(0.822365, 0.439677, 0.360422, 0.0222635), //EXT_ZXY + Quatd(0.723321, 0.531973, 0.20056, 0.391901), //EXT_ZYX + Quatd(0.653285, 0.65328, 0.369641, 0.0990435), //EXT_XYX + Quatd(0.653285, 0.65328, -0.0990435, 0.369641), //EXT_XZX + Quatd(0.653285, 0.369641, 0.65328, -0.0990435), //EXT_YXY + Quatd(0.653285, 0.0990435, 0.65328, 0.369641), //EXT_YZY + Quatd(0.653285, 0.369641, 0.0990435, 0.65328), //EXT_ZXZ + Quatd(0.653285, -0.0990435, 0.369641, 0.65328) //EXT_ZYZ +}; + +TEST_F(QuatTest, EulerAngles) +{ + Vec3d test_angle = {0.523598, 0.78539, 1.04719}; + for (QuatEnum::EulerAnglesType i = QuatEnum::EulerAnglesType::INT_XYZ; i <= QuatEnum::EulerAnglesType::EXT_ZYZ; i = (QuatEnum::EulerAnglesType)(i + 1)) + { + SCOPED_TRACE(cv::format("EulerAnglesType=%d", i)); + Quatd q = Quatd::createFromEulerAngles(test_angle, i); + EXPECT_EQ(q, qEuler[i]); + Vec3d Euler_Angles = q.toEulerAngles(i); + EXPECT_NEAR(Euler_Angles[0], test_angle[0], 1e-6); + EXPECT_NEAR(Euler_Angles[1], test_angle[1], 1e-6); + EXPECT_NEAR(Euler_Angles[2], test_angle[2], 1e-6); + } + Quatd qEuler0 = {0, 0, 0, 0}; + EXPECT_ANY_THROW(qEuler0.toEulerAngles(QuatEnum::INT_XYZ)); + + Quatd qEulerLock1 = {0.5612665, 0.43042, 0.5607083, 0.4304935}; + Vec3d test_angle_lock1 = {1.3089878, CV_PI * 0.5, 0}; + Vec3d Euler_Angles_solute_1 = qEulerLock1.toEulerAngles(QuatEnum::INT_XYZ); + EXPECT_NEAR(Euler_Angles_solute_1[0], test_angle_lock1[0], 1e-6); + EXPECT_NEAR(Euler_Angles_solute_1[1], test_angle_lock1[1], 1e-6); + EXPECT_NEAR(Euler_Angles_solute_1[2], test_angle_lock1[2], 1e-6); + + Quatd qEulerLock2 = {0.7010574, 0.0922963, 0.7010573, -0.0922961}; + Vec3d test_angle_lock2 = {-0.2618, CV_PI * 0.5, 0}; + Vec3d Euler_Angles_solute_2 = qEulerLock2.toEulerAngles(QuatEnum::INT_ZYX); + EXPECT_NEAR(Euler_Angles_solute_2[0], test_angle_lock2[0], 1e-6); + EXPECT_NEAR(Euler_Angles_solute_2[1], test_angle_lock2[1], 1e-6); + EXPECT_NEAR(Euler_Angles_solute_2[2], test_angle_lock2[2], 1e-6); + + Vec3d test_angle6 = {CV_PI * 0.25, CV_PI * 0.5, CV_PI * 0.25}; + Vec3d test_angle7 = {CV_PI * 0.5, CV_PI * 0.5, 0}; + EXPECT_EQ(Quatd::createFromEulerAngles(test_angle6, QuatEnum::INT_ZXY), Quatd::createFromEulerAngles(test_angle7, QuatEnum::INT_ZXY)); +} + + + +class DualQuatTest: public ::testing::Test +{ +protected: + double scalar = 2.5; + double angle = CV_PI; + Vec axis{1, 1, 1}; + Vec unAxis{0, 0, 0}; + Vec unitAxis{1.0 / sqrt(3), 1.0 / sqrt(3), 1.0 / sqrt(3)}; + DualQuatd dq1{1, 2, 3, 4, 5, 6, 7, 8}; + Vec3d trans{0, 0, 5}; + double rotation_angle = 2.0 / 3 * CV_PI; + DualQuatd dq2 = DualQuatd::createFromAngleAxisTrans(rotation_angle, axis, trans); + DualQuatd dqAllOne{1, 1, 1, 1, 1, 1, 1, 1}; + DualQuatd dqAllZero{0, 0, 0, 0, 0, 0, 0, 0}; + DualQuatd dqIdentity{1, 0, 0, 0, 0, 0, 0, 0}; + DualQuatd dqTrans{1, 0, 0, 0, 0, 2, 3, 4}; + DualQuatd dqOnlyTrans{0, 0, 0, 0, 0, 2, 3, 4}; + DualQuatd dualNumber1{-3,0,0,0,-31.1,0,0,0}; + DualQuatd dualNumber2{4,0,0,0,5.1,0,0,0}; +}; + +TEST_F(DualQuatTest, 
constructor) +{ + EXPECT_EQ(dq1, DualQuatd::createFromQuat(Quatd(1, 2, 3, 4), Quatd(5, 6, 7, 8))); + EXPECT_EQ(dq2 * dq2.conjugate(), dqIdentity); + EXPECT_NEAR(dq2.getRotation(QUAT_ASSUME_UNIT).norm(), 1, 1e-6); + EXPECT_NEAR(dq2.getRealPart().dot(dq2.getDualPart()), 0, 1e-6); + EXPECT_MAT_NEAR(dq2.getTranslation(QUAT_ASSUME_UNIT), trans, 1e-6); + DualQuatd q_conj = DualQuatd::createFromQuat(dq2.getRealPart().conjugate(), -dq2.getDualPart().conjugate()); + DualQuatd q{1,0,0,0,0,3,0,0}; + EXPECT_EQ(dq2 * q * q_conj, DualQuatd(1,0,0,0,0,0,3,5)); + Matx44d R1 = dq2.toMat(); + DualQuatd dq3 = DualQuatd::createFromMat(R1); + EXPECT_EQ(dq3, dq2); + axis = axis / std::sqrt(axis.dot(axis)); + Vec3d moment = 1.0 / 2 * (trans.cross(axis) + axis.cross(trans.cross(axis)) * + std::cos(rotation_angle / 2) / std::sin(rotation_angle / 2)); + double d = trans.dot(axis); + DualQuatd dq4 = DualQuatd::createFromPitch(rotation_angle, d, axis, moment); + EXPECT_EQ(dq4, dq3); + EXPECT_EQ(dq2, DualQuatd::createFromAffine3(dq2.toAffine3())); + EXPECT_EQ(dq1.normalize(), DualQuatd::createFromAffine3(dq1.toAffine3())); +} + +TEST_F(DualQuatTest, test_operator) +{ + DualQuatd dq_origin{1, 2, 3, 4, 5, 6, 7, 8}; + EXPECT_EQ(dq1 - dqAllOne, DualQuatd(0, 1, 2, 3, 4, 5, 6, 7)); + EXPECT_EQ(-dq1, DualQuatd(-1, -2, -3, -4, -5, -6, -7, -8)); + EXPECT_EQ(dq1 + dqAllOne, DualQuatd(2, 3, 4, 5, 6, 7, 8, 9)); + EXPECT_EQ(dq1 / dq1, dqIdentity); + DualQuatd dq3{-4, 1, 3, 2, -15.5, 0, -3, 8.5}; + EXPECT_EQ(dq1 * dq2, dq3); + EXPECT_EQ(dq3 / dq2, dq1); + DualQuatd dq12{2, 4, 6, 8, 10, 12, 14, 16}; + EXPECT_EQ(dq1 * 2.0, dq12); + EXPECT_EQ(2.0 * dq1, dq12); + EXPECT_EQ(dq1 - 1.0, DualQuatd(0, 2, 3, 4, 5, 6, 7, 8)); + EXPECT_EQ(1.0 - dq1, DualQuatd(0, -2, -3, -4, -5, -6, -7, -8)); + EXPECT_EQ(dq1 + 1.0, DualQuatd(2, 2, 3, 4, 5, 6, 7, 8)); + EXPECT_EQ(1.0 + dq1, DualQuatd(2, 2, 3, 4, 5, 6, 7, 8)); + dq1 += dq2; + EXPECT_EQ(dq1, dq_origin + dq2); + dq1 -= dq2; + EXPECT_EQ(dq1, dq_origin); + dq1 *= dq2; + EXPECT_EQ(dq1, dq_origin * dq2); + dq1 /= dq2; + EXPECT_EQ(dq1, dq_origin); +} + +TEST_F(DualQuatTest, basic_ops) +{ + EXPECT_EQ(dq1.getRealPart(), Quatd(1, 2, 3, 4)); + EXPECT_EQ(dq1.getDualPart(), Quatd(5, 6, 7, 8)); + EXPECT_EQ((dq1 * dq2).conjugate(), conjugate(dq1 * dq2)); + EXPECT_EQ(dq1.conjugate(), DualQuatd::createFromQuat(dq1.getRealPart().conjugate(), dq1.getDualPart().conjugate())); + EXPECT_EQ((dq2 * dq1).conjugate(), dq1.conjugate() * dq2.conjugate()); + EXPECT_EQ(dq1.conjugate() * dq1, dq1.norm() * dq1.norm()); + EXPECT_EQ(dq1.conjugate() * dq1, dq1.norm().power(2.0)); + EXPECT_EQ(dualNumber2.power(2.0), DualQuatd(16, 0, 0, 0, 40.8, 0, 0, 0)); + EXPECT_EQ(dq1.power(2.0), (2.0 * dq1.log()).exp()); + EXPECT_EQ(power(dq1, 2.0), (exp(2.0 * log(dq1)))); + EXPECT_EQ(dq2.power(3.0 / 2, QUAT_ASSUME_UNIT).power(4.0 / 3, QUAT_ASSUME_UNIT), dq2 * dq2); + EXPECT_EQ(dq2.power(-0.5).power(2.0), dq2.inv()); + EXPECT_EQ(power(dq1, dq2), exp(dq2 * log(dq1))); + EXPECT_EQ(power(dq2, dq1, QUAT_ASSUME_UNIT), exp(dq1 * log(dq2))); + EXPECT_EQ((dq2.norm() * dq1).power(2.0), dq1.power(2.0) * dq2.norm().power(2.0)); + DualQuatd q1norm = dq1.normalize(); + EXPECT_EQ(dq2.norm(), dqIdentity); + EXPECT_NEAR(q1norm.getRealPart().norm(), 1, 1e-6); + EXPECT_NEAR(q1norm.getRealPart().dot(q1norm.getDualPart()), 0, 1e-6); + EXPECT_NEAR(dq1.getRotation().norm(), 1, 1e-6); + EXPECT_NEAR(dq2.getRotation(QUAT_ASSUME_UNIT).norm(), 1, 1e-6); + EXPECT_NEAR(dq2.getRotation(QUAT_ASSUME_UNIT).norm(), 1, 1e-6); + EXPECT_MAT_NEAR(Mat(dq2.getTranslation()), 
Mat(trans), 1e-6); + EXPECT_MAT_NEAR(Mat(q1norm.getTranslation(QUAT_ASSUME_UNIT)), Mat(dq1.getTranslation()), 1e-6); + EXPECT_EQ(dq2.getTranslation(), dq2.getTranslation(QUAT_ASSUME_UNIT)); + EXPECT_EQ(dq1.inv() * dq1, dqIdentity); + EXPECT_EQ(inv(dq1) * dq1, dqIdentity); + EXPECT_EQ(dq2.inv(QUAT_ASSUME_UNIT) * dq2, dqIdentity); + EXPECT_EQ(inv(dq2, QUAT_ASSUME_UNIT) * dq2, dqIdentity); + EXPECT_EQ(dq2.inv(), dq2.conjugate()); + EXPECT_EQ(dqIdentity.inv(), dqIdentity); + EXPECT_ANY_THROW(dqAllZero.inv()); + EXPECT_EQ(dqAllZero.exp(), dqIdentity); + EXPECT_EQ(exp(dqAllZero), dqIdentity); + EXPECT_ANY_THROW(log(dqAllZero)); + EXPECT_EQ(log(dqIdentity), dqAllZero); + EXPECT_EQ(dqIdentity.log(), dqAllZero); + EXPECT_EQ(dualNumber1 * dualNumber2, dualNumber2 * dualNumber1); + EXPECT_EQ(dualNumber2.exp().log(), dualNumber2); + EXPECT_EQ(dq2.log(QUAT_ASSUME_UNIT).exp(), dq2); + EXPECT_EQ(exp(log(dq2, QUAT_ASSUME_UNIT)), dq2); + EXPECT_EQ(dqIdentity.log(QUAT_ASSUME_UNIT).exp(), dqIdentity); + EXPECT_EQ(dq1.log().exp(), dq1); + EXPECT_EQ(dqTrans.log().exp(), dqTrans); + EXPECT_MAT_NEAR(q1norm.toMat(QUAT_ASSUME_UNIT), dq1.toMat(), 1e-6); + Matx44d R1 = dq2.toMat(); + Mat point = (Mat_(4, 1) << 3, 0, 0, 1); + Mat new_point = R1 * point; + Mat after = (Mat_(4, 1) << 0, 3, 5 ,1); + EXPECT_MAT_NEAR(new_point, after, 1e-6); + Vec vec = dq1.toVec(); + EXPECT_EQ(DualQuatd(vec), dq1); + Affine3d afd = q1norm.toAffine3(QUAT_ASSUME_UNIT); + EXPECT_MAT_NEAR(Mat(afd.translation()), Mat(q1norm.getTranslation(QUAT_ASSUME_UNIT)), 1e-6); + Affine3d dq1_afd = dq1.toAffine3(); + EXPECT_MAT_NEAR(dq1_afd.matrix, afd.matrix, 1e-6); + EXPECT_ANY_THROW(dqAllZero.toAffine3()); +} + +TEST_F(DualQuatTest, interpolation) +{ + DualQuatd dq = DualQuatd::createFromAngleAxisTrans(8 * CV_PI / 5, Vec3d{0, 0, 1}, Vec3d{0, 0, 10}); + EXPECT_EQ(DualQuatd::sclerp(dqIdentity, dq, 0.5), DualQuatd::sclerp(-dqIdentity, dq, 0.5, false)); + EXPECT_EQ(DualQuatd::sclerp(dqIdentity, dq, 0), -dqIdentity); + EXPECT_EQ(DualQuatd::sclerp(dqIdentity, dq2, 1), dq2); + EXPECT_EQ(DualQuatd::sclerp(dqIdentity, dq2, 0.4, false, QUAT_ASSUME_UNIT), DualQuatd(0.91354546, 0.23482951, 0.23482951, 0.23482951, -0.23482951, -0.47824988, 0.69589767, 0.69589767)); + EXPECT_EQ(DualQuatd::dqblend(dqIdentity, dq1.normalize(), 0.2, QUAT_ASSUME_UNIT), DualQuatd::dqblend(dqIdentity, -dq1, 0.2)); + EXPECT_EQ(DualQuatd::dqblend(dqIdentity, dq2, 0.4), DualQuatd(0.91766294, 0.22941573, 0.22941573, 0.22941573, -0.21130397, -0.48298049, 0.66409818, 0.66409818)); + DualQuatd gdb = DualQuatd::gdqblend(Vec{dqIdentity, dq, dq2}, Vec3d{0.4, 0, 0.6}, QUAT_ASSUME_UNIT); + EXPECT_EQ(gdb, DualQuatd::dqblend(dqIdentity, dq2, 0.6)); + EXPECT_ANY_THROW(DualQuatd::gdqblend(Vec{dq2}, Vec2d{0.5, 0.5})); + Mat gdqb_d(1, 2, CV_64FC(7)); + gdqb_d.at>(0, 0) = Vec{1,2,3,4,5,6,7}; + gdqb_d.at>(0, 1) = Vec{1,2,3,4,5,6,7}; + EXPECT_ANY_THROW(DualQuatd::gdqblend(gdqb_d, Vec2d{0.5, 0.5})); + Mat gdqb_f(1, 2, CV_32FC(8)); + gdqb_f.at>(0, 0) = Vec{1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f}; + gdqb_f.at>(0, 1) = Vec{1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f}; + EXPECT_ANY_THROW(DualQuatd::gdqblend(gdqb_f, Vec2d{0.5, 0.5})); + EXPECT_ANY_THROW(DualQuatd::gdqblend(Vec{dqIdentity, dq, dq2}, Vec3f{0.4f, 0.f, 0.6f}, QUAT_ASSUME_UNIT)); + EXPECT_EQ(gdb, DualQuatd::gdqblend(Vec{dqIdentity, dq * dualNumber1, -dq2}, Vec3d{0.4, 0, 0.6})); +} + + +}} // namespace diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp index 39aaa1edb4..24d35646df 100644 --- 
a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -364,6 +364,7 @@ CV__DNN_INLINE_NS_BEGIN
      * Inner vector has slice ranges for the first number of input dimensions.
      */
     std::vector<std::vector<Range> > sliceRanges;
+    std::vector<std::vector<int> > sliceSteps;
     int axis;
     int num_split;
@@ -499,6 +500,14 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<PowerLayer> create(const LayerParams &params);
     };
 
+    class CV_EXPORTS ExpLayer : public ActivationLayer
+    {
+    public:
+        float base, scale, shift;
+
+        static Ptr<ExpLayer> create(const LayerParams &params);
+    };
+
     /* Layers used in semantic segmentation */
 
     class CV_EXPORTS CropLayer : public Layer
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 69b71f90ce..0743de00ab 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -100,6 +100,18 @@ CV__DNN_INLINE_NS_BEGIN
     CV_EXPORTS std::vector< std::pair<Backend, Target> > getAvailableBackends();
     CV_EXPORTS_W std::vector<Target> getAvailableTargets(dnn::Backend be);
 
+    /**
+     * @brief Enables detailed logging of the DNN model loading with CV DNN API.
+     * @param[in] isDiagnosticsMode Indicates whether diagnostic mode should be set.
+     *
+     * Diagnostic mode provides detailed logging of the model loading stage to explore
+     * potential problems (e.g. a not implemented layer type).
+     *
+     * @note In diagnostic mode a series of assertions is skipped, which may lead to
+     * application crashes; such crashes are expected in this mode.
+     */
+    CV_EXPORTS void enableModelDiagnostics(bool isDiagnosticsMode);
+
     /** @brief This class provides all data needed to initialize layer.
      *
      * It includes dictionary with scalar params (which can be read by using Dict interface),
@@ -1216,7 +1228,7 @@
      * KeypointsModel creates net from file with trained weights and config,
      * sets preprocessing input, runs forward pass and returns the x and y coordinates of each detected keypoint
      */
-    class CV_EXPORTS_W KeypointsModel: public Model
+    class CV_EXPORTS_W_SIMPLE KeypointsModel: public Model
     {
     public:
         /**
@@ -1248,7 +1260,7 @@
      * SegmentationModel creates net from file with trained weights and config,
      * sets preprocessing input, runs forward pass and returns the class prediction for each pixel.
      */
-    class CV_EXPORTS_W SegmentationModel: public Model
+    class CV_EXPORTS_W_SIMPLE SegmentationModel: public Model
     {
     public:
         /**
@@ -1296,6 +1308,23 @@
          */
         CV_WRAP DetectionModel(const Net& network);
 
+        CV_DEPRECATED_EXTERNAL  // avoid using in C++ code (need to fix bindings first)
+        DetectionModel();
+
+        /**
+         * @brief nmsAcrossClasses defaults to false, so that when non-maximum suppression
+         * is applied during the detect() call, it is performed per class.
+         * This function allows you to toggle this behaviour.
+         * @param[in] value The new value for nmsAcrossClasses
+         */
+        CV_WRAP DetectionModel& setNmsAcrossClasses(bool value);
+
+        /**
+         * @brief Getter for nmsAcrossClasses. This variable defaults to false, meaning that
+         * non-maximum suppression applied during the detect() call is performed per class only.
+         */
+        CV_WRAP bool getNmsAcrossClasses();
+
         /** @brief Given the @p input frame, create input blob, run net and return result detections.
          * @param[in] frame The input image.
          * @param[out] classIds Class indexes in result detection.
@@ -1309,6 +1338,255 @@
                             float confThreshold = 0.5f, float nmsThreshold = 0.0f);
     };
 
+
+/** @brief This class represents high-level API for text recognition networks.
+ *
+ * TextRecognitionModel allows setting params for preprocessing an input image.
+ * TextRecognitionModel creates net from file with trained weights and config,
+ * sets preprocessing input, runs forward pass and returns the recognition result.
+ * For TextRecognitionModel, CRNN-CTC is supported.
+ */
+class CV_EXPORTS_W_SIMPLE TextRecognitionModel : public Model
+{
+public:
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    TextRecognitionModel();
+
+    /**
+     * @brief Create Text Recognition model from deep learning network
+     * Call setDecodeType() and setVocabulary() after constructor to initialize the decoding method
+     * @param[in] network Net object
+     */
+    CV_WRAP TextRecognitionModel(const Net& network);
+
+    /**
+     * @brief Create text recognition model from network represented in one of the supported formats
+     * Call setDecodeType() and setVocabulary() after constructor to initialize the decoding method
+     * @param[in] model Binary file contains trained weights
+     * @param[in] config Text file contains network configuration
+     */
+    CV_WRAP inline
+    TextRecognitionModel(const std::string& model, const std::string& config = "")
+        : TextRecognitionModel(readNet(model, config)) { /* nothing */ }
+
+    /**
+     * @brief Set the decoding method of translating the network output into string
+     * @param[in] decodeType The decoding method of translating the network output into string: {'CTC-greedy': greedy decoding for the output of CTC-based methods}
+     */
+    CV_WRAP
+    TextRecognitionModel& setDecodeType(const std::string& decodeType);
+
+    /**
+     * @brief Get the decoding method
+     * @return the decoding method
+     */
+    CV_WRAP
+    const std::string& getDecodeType() const;
+
+    /**
+     * @brief Set the vocabulary for recognition.
+     * @param[in] vocabulary the associated vocabulary of the network.
+     */
+    CV_WRAP
+    TextRecognitionModel& setVocabulary(const std::vector<std::string>& vocabulary);
+
+    /**
+     * @brief Get the vocabulary for recognition.
+     * @return vocabulary the associated vocabulary
+     */
+    CV_WRAP
+    const std::vector<std::string>& getVocabulary() const;
+
+    /**
+     * @brief Given the @p input frame, create input blob, run net and return recognition result
+     * @param[in] frame The input image
+     * @return The text recognition result
+     */
+    CV_WRAP
+    std::string recognize(InputArray frame) const;
+
+    /**
+     * @brief Given the @p input frame, create input blob, run net and return recognition result
+     * @param[in] frame The input image
+     * @param[in] roiRects List of text detection regions of interest (cv::Rect, CV_32SC4). ROIs will be cropped as the network inputs
+     * @param[out] results A set of text recognition results.
+     */
+    CV_WRAP
+    void recognize(InputArray frame, InputArrayOfArrays roiRects, CV_OUT std::vector<std::string>& results) const;
+};
+
+
+/** @brief Base class for text detection networks
+ */
+class CV_EXPORTS_W_SIMPLE TextDetectionModel : public Model
+{
+protected:
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    TextDetectionModel();
+
+public:
+
+    /** @brief Performs detection
+     *
+     * Given the input @p frame, prepare network input, run network inference, post-process network output and return result detections.
+     *
+     * Each result is quadrangle's 4 points in this order:
+     * - bottom-left
+     * - top-left
+     * - top-right
+     * - bottom-right
+     *
+     * Use cv::getPerspectiveTransform function to retrieve image region without perspective transformations.
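+     *
+     * A minimal usage sketch (the model file name below is illustrative; see the
+     * EAST/DB subclasses further down for the concrete loaders):
+     * @code
+     * TextDetectionModel_EAST model("frozen_east_text_detection.pb");
+     * std::vector< std::vector<Point> > detections;
+     * std::vector<float> confidences;
+     * model.detect(frame, detections, confidences);
+     * @endcode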
+ * + * @note If DL model doesn't support that kind of output then result may be derived from detectTextRectangles() output. + * + * @param[in] frame The input image + * @param[out] detections array with detections' quadrangles (4 points per result) + * @param[out] confidences array with detection confidences + */ + CV_WRAP + void detect( + InputArray frame, + CV_OUT std::vector< std::vector >& detections, + CV_OUT std::vector& confidences + ) const; + + /** @overload */ + CV_WRAP + void detect( + InputArray frame, + CV_OUT std::vector< std::vector >& detections + ) const; + + /** @brief Performs detection + * + * Given the input @p frame, prepare network input, run network inference, post-process network output and return result detections. + * + * Each result is rotated rectangle. + * + * @note Result may be inaccurate in case of strong perspective transformations. + * + * @param[in] frame the input image + * @param[out] detections array with detections' RotationRect results + * @param[out] confidences array with detection confidences + */ + CV_WRAP + void detectTextRectangles( + InputArray frame, + CV_OUT std::vector& detections, + CV_OUT std::vector& confidences + ) const; + + /** @overload */ + CV_WRAP + void detectTextRectangles( + InputArray frame, + CV_OUT std::vector& detections + ) const; +}; + +/** @brief This class represents high-level API for text detection DL networks compatible with EAST model. + * + * Configurable parameters: + * - (float) confThreshold - used to filter boxes by confidences, default: 0.5f + * - (float) nmsThreshold - used in non maximum suppression, default: 0.0f + */ +class CV_EXPORTS_W_SIMPLE TextDetectionModel_EAST : public TextDetectionModel +{ +public: + CV_DEPRECATED_EXTERNAL // avoid using in C++ code, will be moved to "protected" (need to fix bindings first) + TextDetectionModel_EAST(); + + /** + * @brief Create text detection algorithm from deep learning network + * @param[in] network Net object + */ + CV_WRAP TextDetectionModel_EAST(const Net& network); + + /** + * @brief Create text detection model from network represented in one of the supported formats. + * An order of @p model and @p config arguments does not matter. + * @param[in] model Binary file contains trained weights. + * @param[in] config Text file contains network configuration. + */ + CV_WRAP inline + TextDetectionModel_EAST(const std::string& model, const std::string& config = "") + : TextDetectionModel_EAST(readNet(model, config)) { /* nothing */ } + + /** + * @brief Set the detection confidence threshold + * @param[in] confThreshold A threshold used to filter boxes by confidences + */ + CV_WRAP + TextDetectionModel_EAST& setConfidenceThreshold(float confThreshold); + + /** + * @brief Get the detection confidence threshold + */ + CV_WRAP + float getConfidenceThreshold() const; + + /** + * @brief Set the detection NMS filter threshold + * @param[in] nmsThreshold A threshold used in non maximum suppression + */ + CV_WRAP + TextDetectionModel_EAST& setNMSThreshold(float nmsThreshold); + + /** + * @brief Get the detection confidence threshold + */ + CV_WRAP + float getNMSThreshold() const; +}; + +/** @brief This class represents high-level API for text detection DL networks compatible with DB model. 
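+ *
+ * A minimal usage sketch (the model file name is illustrative; threshold values follow
+ * the defaults documented below):
+ * @code
+ * TextDetectionModel_DB model("DB_TD500_resnet50.onnx");
+ * model.setBinaryThreshold(0.3f).setPolygonThreshold(0.5f);
+ * std::vector<RotatedRect> results;
+ * model.detectTextRectangles(frame, results);
+ * @endcode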
+ *
+ * Related publications: @cite liao2020real
+ * Paper: https://arxiv.org/abs/1911.08947
+ * For more information about the hyper-parameters setting, please refer to https://github.com/MhLiao/DB
+ *
+ * Configurable parameters:
+ * - (float) binaryThreshold - The threshold of the binary map. It is usually set to 0.3.
+ * - (float) polygonThreshold - The threshold of text polygons. It is usually set to 0.5, 0.6 or 0.7; default is 0.5f.
+ * - (double) unclipRatio - The unclip ratio of the detected text region, which determines the output size. It is usually set to 2.0.
+ * - (int) maxCandidates - The maximum number of output results.
+ */
+class CV_EXPORTS_W_SIMPLE TextDetectionModel_DB : public TextDetectionModel
+{
+public:
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    TextDetectionModel_DB();
+
+    /**
+     * @brief Create text detection algorithm from deep learning network.
+     * @param[in] network Net object.
+     */
+    CV_WRAP TextDetectionModel_DB(const Net& network);
+
+    /**
+     * @brief Create text detection model from network represented in one of the supported formats.
+     * The order of @p model and @p config arguments does not matter.
+     * @param[in] model Binary file containing trained weights.
+     * @param[in] config Text file containing the network configuration.
+     */
+    CV_WRAP inline
+    TextDetectionModel_DB(const std::string& model, const std::string& config = "")
+        : TextDetectionModel_DB(readNet(model, config)) { /* nothing */ }
+
+    CV_WRAP TextDetectionModel_DB& setBinaryThreshold(float binaryThreshold);
+    CV_WRAP float getBinaryThreshold() const;
+
+    CV_WRAP TextDetectionModel_DB& setPolygonThreshold(float polygonThreshold);
+    CV_WRAP float getPolygonThreshold() const;
+
+    CV_WRAP TextDetectionModel_DB& setUnclipRatio(double unclipRatio);
+    CV_WRAP double getUnclipRatio() const;
+
+    CV_WRAP TextDetectionModel_DB& setMaxCandidates(int maxCandidates);
+    CV_WRAP int getMaxCandidates() const;
+};
+
 //! @}
 CV__DNN_INLINE_NS_END
 }
diff --git a/modules/dnn/include/opencv2/dnn/dnn.inl.hpp b/modules/dnn/include/opencv2/dnn/dnn.inl.hpp
index d6809ce3fd..8312a418f3 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.inl.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.inl.hpp
@@ -247,6 +247,7 @@ inline DictValue & DictValue::operator=(const DictValue &r)
 }
 
 inline DictValue::DictValue(const DictValue &r)
+    : pv(NULL)
 {
     type = r.type;
diff --git a/modules/dnn/include/opencv2/dnn/layer_reg.private.hpp b/modules/dnn/include/opencv2/dnn/layer_reg.private.hpp
new file mode 100644
index 0000000000..46a58f09bc
--- /dev/null
+++ b/modules/dnn/include/opencv2/dnn/layer_reg.private.hpp
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_LAYER_REG_HPP
+#define OPENCV_DNN_LAYER_REG_HPP
+#include <opencv2/dnn.hpp>
+
+namespace cv {
+namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+//! @addtogroup dnn
+//! @{
+
+//! Register layer types of DNN model.
+typedef std::map<std::string, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;
+LayerFactory_Impl& getLayerFactoryImpl();
+
+//! 
@} +CV__DNN_INLINE_NS_END +} +} +#endif diff --git a/modules/dnn/include/opencv2/dnn/shape_utils.hpp b/modules/dnn/include/opencv2/dnn/shape_utils.hpp index 5b8d953c1a..4c610f6cef 100644 --- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp +++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp @@ -205,24 +205,54 @@ static inline std::ostream& operator<<(std::ostream &out, const MatShape& shape) return out; } -inline int clamp(int ax, int dims) +/// @brief Converts axis from `[-dims; dims)` (similar to Python's slice notation) to `[0; dims)` range. +static inline +int normalize_axis(int axis, int dims) { - return ax < 0 ? ax + dims : ax; + CV_Check(axis, axis >= -dims && axis < dims, ""); + axis = (axis < 0) ? (dims + axis) : axis; + CV_DbgCheck(axis, axis >= 0 && axis < dims, ""); + return axis; } -inline int clamp(int ax, const MatShape& shape) +static inline +int normalize_axis(int axis, const MatShape& shape) { - return clamp(ax, (int)shape.size()); + return normalize_axis(axis, (int)shape.size()); } -inline Range clamp(const Range& r, int axisSize) +static inline +Range normalize_axis_range(const Range& r, int axisSize) { - Range clamped(std::max(r.start, 0), + if (r == Range::all()) + return Range(0, axisSize); + CV_CheckGE(r.start, 0, ""); + Range clamped(r.start, r.end > 0 ? std::min(r.end, axisSize) : axisSize + r.end + 1); - CV_Assert_N(clamped.start < clamped.end, clamped.end <= axisSize); + CV_DbgCheckGE(clamped.start, 0, ""); + CV_CheckLT(clamped.start, clamped.end, ""); + CV_CheckLE(clamped.end, axisSize, ""); return clamped; } +static inline +bool isAllOnes(const MatShape &inputShape, int startPos, int endPos) +{ + CV_Assert(!inputShape.empty()); + + CV_CheckGE((int) inputShape.size(), startPos, ""); + CV_CheckGE(startPos, 0, ""); + CV_CheckLE(startPos, endPos, ""); + CV_CheckLE((size_t)endPos, inputShape.size(), ""); + + for (size_t i = startPos; i < endPos; i++) + { + if (inputShape[i] != 1) + return false; + } + return true; +} + CV__DNN_INLINE_NS_END } } diff --git a/modules/dnn/include/opencv2/dnn/utils/inference_engine.hpp b/modules/dnn/include/opencv2/dnn/utils/inference_engine.hpp index 29882b92b0..333b1bfdd2 100644 --- a/modules/dnn/include/opencv2/dnn/utils/inference_engine.hpp +++ b/modules/dnn/include/opencv2/dnn/utils/inference_engine.hpp @@ -49,6 +49,8 @@ CV_EXPORTS_W void resetMyriadDevice(); #define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_2 "Myriad2" /// Intel(R) Neural Compute Stick 2, NCS2 (USB 03e7:2485), MyriadX (https://software.intel.com/ru-ru/neural-compute-stick) #define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X "MyriadX" +#define CV_DNN_INFERENCE_ENGINE_CPU_TYPE_ARM_COMPUTE "ARM_COMPUTE" +#define CV_DNN_INFERENCE_ENGINE_CPU_TYPE_X86 "X86" /** @brief Returns Inference Engine VPU type. @@ -57,6 +59,11 @@ CV_EXPORTS_W void resetMyriadDevice(); */ CV_EXPORTS_W cv::String getInferenceEngineVPUType(); +/** @brief Returns Inference Engine CPU type. + * + * Specify OpenVINO plugin: CPU or ARM. + */ +CV_EXPORTS_W cv::String getInferenceEngineCPUType(); /** @brief Release a HDDL plugin. */ diff --git a/modules/dnn/include/opencv2/dnn/version.hpp b/modules/dnn/include/opencv2/dnn/version.hpp index 7dc2786906..1cd0b8f486 100644 --- a/modules/dnn/include/opencv2/dnn/version.hpp +++ b/modules/dnn/include/opencv2/dnn/version.hpp @@ -6,7 +6,7 @@ #define OPENCV_DNN_VERSION_HPP /// Use with major OpenCV version only. 
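To make the shape_utils.hpp change above concrete: normalize_axis() keeps the old wrap-around
behavior for negative axes but, unlike the removed clamp(), rejects out-of-range values instead of
passing them through. A standalone sketch (editorial illustration, not part of this patch):

#include <opencv2/dnn/shape_utils.hpp>
#include <iostream>

int main()
{
    // Negative axes follow Python slice notation: -1 maps to the last axis.
    std::cout << cv::dnn::normalize_axis(-1, 4) << std::endl;  // prints 3
    std::cout << cv::dnn::normalize_axis(2, 4) << std::endl;   // prints 2

    // The old clamp(4, 4) silently returned 4; normalize_axis(4, 4) now
    // throws cv::Exception via the CV_Check range assertion.
    return 0;
}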
-#define OPENCV_DNN_API_VERSION 20201117 +#define OPENCV_DNN_API_VERSION 20210301 #if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS #define CV__DNN_INLINE_NS __CV_CAT(dnn5_v, OPENCV_DNN_API_VERSION) diff --git a/modules/dnn/misc/python/test/test_dnn.py b/modules/dnn/misc/python/test/test_dnn.py index 746dabf4ea..d0687ca4bc 100644 --- a/modules/dnn/misc/python/test/test_dnn.py +++ b/modules/dnn/misc/python/test/test_dnn.py @@ -197,6 +197,25 @@ class dnn_test(NewOpenCVTests): normAssert(self, out, ref) + def test_textdetection_model(self): + img_path = self.find_dnn_file("dnn/text_det_test1.png") + weights = self.find_dnn_file("dnn/onnx/models/DB_TD500_resnet50.onnx", required=False) + if weights is None: + raise unittest.SkipTest("Missing DNN test files (onnx/models/DB_TD500_resnet50.onnx). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.") + + frame = cv.imread(img_path) + scale = 1.0 / 255.0 + size = (736, 736) + mean = (122.67891434, 116.66876762, 104.00698793) + + model = cv.dnn_TextDetectionModel_DB(weights) + model.setInputParams(scale, size, mean) + out, _ = model.detect(frame) + + self.assertTrue(type(out) == list) + self.assertTrue(np.array(out).shape == (2, 4, 2)) + + def test_face_detection(self): proto = self.find_dnn_file('dnn/opencv_face_detector.prototxt') model = self.find_dnn_file('dnn/opencv_face_detector.caffemodel', required=False) diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index aef3bc2c31..46db47bc4c 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -206,7 +206,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) throw SkipTestException("Test is disabled in OpenVINO 2020.4"); #endif -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021010000) // nGraph compilation failure +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) // nGraph compilation failure if (target == DNN_TARGET_MYRIAD) throw SkipTestException(""); #endif @@ -241,7 +241,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4_tiny) { if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021010000) // nGraph compilation failure +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) // nGraph compilation failure if (target == DNN_TARGET_MYRIAD) throw SkipTestException(""); #endif @@ -276,9 +276,9 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) throw SkipTestException("Test is disabled in OpenVINO 2019R2"); #endif -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021010000) - if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) - throw SkipTestException("Test is disabled in OpenVINO 2021.1 / MYRIAD"); +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) + if (target == DNN_TARGET_MYRIAD) + throw SkipTestException("Test is disabled in OpenVINO 2021.1+ / MYRIAD"); #endif if (backend == DNN_BACKEND_HALIDE || (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target != DNN_TARGET_CPU) || diff --git a/modules/dnn/src/caffe/opencv-caffe.proto b/modules/dnn/src/caffe/opencv-caffe.proto index 8ab35bac99..d540591f82 100644 --- a/modules/dnn/src/caffe/opencv-caffe.proto +++ b/modules/dnn/src/caffe/opencv-caffe.proto @@ -181,6 +181,8 @@ message DetectionOutputParameter { optional float 
confidence_threshold = 9; // If prior boxes are normalized to [0, 1] or not. optional bool normalized_bbox = 10 [default = true]; + // OpenCV custom parameter + optional bool clip = 1000 [default = false]; } message Datum { diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu index 6a991baea2..599d58852e 100644 --- a/modules/dnn/src/cuda/activations.cu +++ b/modules/dnn/src/cuda/activations.cu @@ -145,6 +145,11 @@ void power(const Stream& stream, Span output, View input, T exp, T scale, generic_op>(stream, output, input, {exp, scale, shift}); } +template +void exp(const Stream& stream, Span output, View input, T normScale, T normShift) { + generic_op>(stream, output, input, {normScale, normShift}); +} + #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half); template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half); @@ -156,6 +161,7 @@ template void elu<__half>(const Stream&, Span<__half>, View<__half>); template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input); template void bnll<__half>(const Stream&, Span<__half>, View<__half>); template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half); +template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half); #endif @@ -169,6 +175,7 @@ template void elu(const Stream&, Span, View); template void abs(const Stream& stream, Span output, View input); template void bnll(const Stream&, Span, View); template void power(const Stream&, Span, View, float, float, float); +template void exp(const Stream&, Span, View, float, float); template static void launch_vectorized_axiswise_relu(const Stream& stream, Span output, View input, std::size_t inner_size, View slope) { diff --git a/modules/dnn/src/cuda/functors.hpp b/modules/dnn/src/cuda/functors.hpp index 0435cb294f..1c29de0426 100644 --- a/modules/dnn/src/cuda/functors.hpp +++ b/modules/dnn/src/cuda/functors.hpp @@ -228,6 +228,25 @@ struct PowerFunctor { T exp, scale, shift; }; +template +struct ExpFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() : normScale(1), normShift(0) { } + CUDA4DNN_HOST_DEVICE Params(T nScale_, T nShift_) : normScale(nScale_), normShift(nShift_) { } + T normScale, normShift; + }; + + CUDA4DNN_DEVICE ExpFunctor() : ExpFunctor(Params{}) { } + CUDA4DNN_DEVICE ExpFunctor(const Params& params) : normScale{params.normScale}, normShift{params.normShift} { } + + CUDA4DNN_DEVICE T operator()(T value) { + using csl::device::fast_exp; + return fast_exp(normShift + normScale * value); + } + + T normScale, normShift; +}; + template struct MaxFunctor { struct Params { @@ -297,4 +316,4 @@ struct DivFunctor { }}}} /* namespace cv::dnn::cuda4dnn::kernels */ -#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */ \ No newline at end of file +#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */ diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp index 1a9b221896..273f3fe98e 100644 --- a/modules/dnn/src/cuda/math.hpp +++ b/modules/dnn/src/cuda/math.hpp @@ -108,6 +108,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace de template __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); } + template __device__ long lround(T value); + template <> inline __device__ long lround(double value) { return ::lround(value); } + template <> inline __device__ long lround(float value) { return 
lroundf(value); } + template __device__ T round(T value); template <> inline __device__ double round(double value) { return ::round(value); } template <> inline __device__ float round(float value) { return roundf(value); } diff --git a/modules/dnn/src/cuda/max_unpooling.cu b/modules/dnn/src/cuda/max_unpooling.cu index fbfb5ae432..3bfd75f926 100644 --- a/modules/dnn/src/cuda/max_unpooling.cu +++ b/modules/dnn/src/cuda/max_unpooling.cu @@ -31,7 +31,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { namespace raw { template ::type = true> /* Order has been hardcoded; see code */ + typename std::enable_if::type = true> /* Order has been hardcoded; see code */ __global__ void max_pooling_with_indices( Span output, Span indices, View input, size_type channels, array out_spatial_dims, array in_spatial_dims, @@ -72,7 +72,22 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { in_spatial_size *= in_spatial_dims[i]; const auto outer_offset = (n * channels + c) * in_spatial_size; - if (Order == 2) { + if (Order == 1) { + array idx; + for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) { + index_type offset = 0; + index_type stride = 1; + for (int i = Order - 1; i >= 0; i--) { + offset += stride * idx[i]; + stride *= in_spatial_dims[i]; + } + + if (input[outer_offset + offset] > max_value) { + max_idx = offset; + max_value = input[outer_offset + offset]; + } + } + } else if (Order == 2) { array idx; for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) { for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) { @@ -206,8 +221,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { out_spatial_dims[i] = output.get_axis_size(2 + i); } - /* only max_pooling2d and max_pooling3d are supported */ - CV_Assert(2 <= order && order <= 3); + CV_Assert(1 <= order && order <= 3); std::size_t channels = input.get_axis_size(1); if (order == 3) { launch_max_pooling_kernel(stream, output, indices, input, channels, @@ -215,6 +229,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { } else if (order == 2) { launch_max_pooling_kernel(stream, output, indices, input, channels, out_spatial_dims, in_spatial_dims, window_size, strides, padding_left); + } else if (order == 1) { + launch_max_pooling_kernel(stream, output, indices, input, channels, + out_spatial_dims, in_spatial_dims, window_size, strides, padding_left); } } diff --git a/modules/dnn/src/cuda/resize.cu b/modules/dnn/src/cuda/resize.cu index 045b4f0a87..b780dab9f9 100644 --- a/modules/dnn/src/cuda/resize.cu +++ b/modules/dnn/src/cuda/resize.cu @@ -26,7 +26,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template __global__ void resize_nn( Span output, size_type out_height, size_type out_width, - View input, size_type in_height, size_type in_width) + View input, size_type in_height, size_type in_width, + float o2i_fy, float o2i_fx, bool round, bool half_pixel_centers) { auto in_image_size = in_height * in_width; auto out_image_size = out_height * out_width; @@ -60,12 +61,16 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { const index_type y = (iter % out_image_size) / out_width; const index_type x = iter % out_width; - /* o2i = output to input */ - auto o2i_fy = static_cast(in_height) / out_height; - auto o2i_fx = static_cast(in_width) / out_width; + auto in_yf = half_pixel_centers ? (y + 0.5f) * o2i_fy : y * o2i_fy; + auto in_xf = half_pixel_centers ? 
(x + 0.5f) * o2i_fx : x * o2i_fx; - auto in_y = static_cast(y * o2i_fy); - auto in_x = static_cast(x * o2i_fx); + using device::lround; + index_type in_y = round ? lround(in_yf) : static_cast(in_yf); + index_type in_x = round ? lround(in_xf) : static_cast(in_xf); + + using device::min; + in_y = min(in_y, in_height - 1); + in_x = min(in_x, in_width - 1); index_type in_idx = c_start * in_image_size + in_y * in_width + in_x; index_type out_idx = c_start * out_image_size + y * out_width + x; @@ -83,7 +88,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { __global__ void resize_bilinear( Span output, size_type out_height, size_type out_width, View input, size_type in_height, size_type in_width, - float o2i_fy, float o2i_fx) + float o2i_fy, float o2i_fx, bool half_pixel_centers) { auto in_image_size = in_height * in_width; auto out_image_size = out_height * out_width; @@ -119,8 +124,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { const index_type y = (iter % out_image_size) / out_width; const index_type x = iter % out_width; - auto in_x = x * o2i_fx; - auto in_y = y * o2i_fy; + using device::max; + auto in_x = half_pixel_centers ? max((x + 0.5f) * o2i_fx - 0.5f, 0.0f) : x * o2i_fx; + auto in_y = half_pixel_centers ? max((y + 0.5f) * o2i_fy - 0.5f, 0.0f) : y * o2i_fy; auto in_x0 = static_cast(in_x); auto in_y0 = static_cast(in_y); @@ -157,15 +163,16 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template static void launch_multichannel_resize_nn(const Stream& stream, Span output, size_type out_height, size_type out_width, - View input, size_type in_height, size_type in_width) + View input, size_type in_height, size_type in_width, + float scale_y, float scale_x, bool round, bool half_pixel_centers) { auto kernel = raw::resize_nn; auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream); - launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width); + launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers); } template - void resize_nn(const Stream& stream, TensorSpan output, TensorView input) { + void resize_nn(const Stream& stream, TensorSpan output, TensorView input, float scale_y, float scale_x, bool round, bool half_pixel_centers) { auto out_height = output.get_axis_size(-2); auto out_width = output.get_axis_size(-1); @@ -176,38 +183,38 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { auto num_iters = num_effective_channels * out_height * out_width; if (num_effective_channels % 32 == 0 && num_iters > 655360) { - launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width); + launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers); } else if (num_effective_channels % 16 == 0 && num_iters > 327680) { - launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width); + launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers); } else if (num_effective_channels % 8 == 0 && num_iters > 163840) { - launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width); + launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, 
half_pixel_centers); } else if (num_effective_channels % 4 == 0 && num_iters > 81920) { - launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width); + launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers); } else if (num_effective_channels % 2 == 0) { - launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width); + launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers); } else { - launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width); + launch_multichannel_resize_nn(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers); } } #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) - template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>); + template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool, bool); #endif - template void resize_nn(const Stream&, TensorSpan, TensorView); + template void resize_nn(const Stream&, TensorSpan, TensorView, float, float, bool,bool); template static void launch_multichannel_resize_bilinear(const Stream& stream, Span output, size_type out_height, size_type out_width, View input, size_type in_height, size_type in_width, - float scale_y, float scale_x) + float scale_y, float scale_x, bool half_pixel_centers) { auto kernel = raw::resize_bilinear; auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream); - launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x); + launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers); } template - void resize_bilinear(const Stream& stream, TensorSpan output, TensorView input, float scale_y, float scale_x) { + void resize_bilinear(const Stream& stream, TensorSpan output, TensorView input, float scale_y, float scale_x, bool half_pixel_centers) { auto out_height = output.get_axis_size(-2); auto out_width = output.get_axis_size(-1); @@ -218,21 +225,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { auto num_iters = num_effective_channels * out_height * out_width; if (num_effective_channels % 16 == 0 && num_iters > 163840) { - launch_multichannel_resize_bilinear(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x); + launch_multichannel_resize_bilinear(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers); } else if (num_effective_channels % 8 == 0 && num_iters > 81920) { - launch_multichannel_resize_bilinear(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x); + launch_multichannel_resize_bilinear(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers); } else if (num_effective_channels % 4 == 0 && num_iters > 40960) { - launch_multichannel_resize_bilinear(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x); + launch_multichannel_resize_bilinear(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers); } else if (num_effective_channels % 2 == 0) { - launch_multichannel_resize_bilinear(stream, output, 
out_height, out_width, input, in_height, in_width, scale_y, scale_x); + launch_multichannel_resize_bilinear(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers); } else { - launch_multichannel_resize_bilinear(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x); + launch_multichannel_resize_bilinear(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers); } } #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) - template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float); + template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool); #endif - template void resize_bilinear(const Stream&, TensorSpan, TensorView, float, float); + template void resize_bilinear(const Stream&, TensorSpan, TensorView, float, float, bool); }}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda4dnn/init.hpp b/modules/dnn/src/cuda4dnn/init.hpp index e9d997311f..f5bb7714f8 100644 --- a/modules/dnn/src/cuda4dnn/init.hpp +++ b/modules/dnn/src/cuda4dnn/init.hpp @@ -17,28 +17,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { void checkVersions() { - int cudart_version = 0; - CUDA4DNN_CHECK_CUDA(cudaRuntimeGetVersion(&cudart_version)); - if (cudart_version != CUDART_VERSION) + // https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#programming-model + // cuDNN API Compatibility + // Beginning in cuDNN 7, the binary compatibility of a patch and minor releases is maintained as follows: + // Any patch release x.y.z is forward or backward-compatible with applications built against another cuDNN patch release x.y.w (meaning, of the same major and minor version number, but having w!=z). + // cuDNN minor releases beginning with cuDNN 7 are binary backward-compatible with applications built against the same or earlier patch release (meaning, an application built against cuDNN 7.x is binary compatible with cuDNN library 7.y, where y>=x). + // Applications compiled with a cuDNN version 7.y are not guaranteed to work with 7.x release when y > x. + auto cudnn_bversion = cudnnGetVersion(); + auto cudnn_major_bversion = cudnn_bversion / 1000, cudnn_minor_bversion = cudnn_bversion % 1000 / 100; + if (cudnn_major_bversion != CUDNN_MAJOR || cudnn_minor_bversion < CUDNN_MINOR) { std::ostringstream oss; - oss << "CUDART reports version " << cudart_version << " which does not match with the version " << CUDART_VERSION << " with which OpenCV was built"; - CV_LOG_WARNING(NULL, oss.str().c_str()); - } - - auto cudnn_version = cudnnGetVersion(); - if (cudnn_version != CUDNN_VERSION) - { - std::ostringstream oss; - oss << "cuDNN reports version " << cudnn_version << " which does not match with the version " << CUDNN_VERSION << " with which OpenCV was built"; - CV_LOG_WARNING(NULL, oss.str().c_str()); - } - - auto cudnn_cudart_version = cudnnGetCudartVersion(); - if (cudart_version != cudnn_cudart_version) - { - std::ostringstream oss; - oss << "CUDART version " << cudnn_cudart_version << " reported by cuDNN " << cudnn_version << " does not match with the version reported by CUDART " << cudart_version; + oss << "cuDNN reports version " << cudnn_major_bversion << "." << cudnn_minor_bversion << " which is not compatible with the version " << CUDNN_MAJOR << "." 
<< CUDNN_MINOR << " with which OpenCV was built"; CV_LOG_WARNING(NULL, oss.str().c_str()); } } @@ -57,9 +47,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { bool isDeviceCompatible() { - if (getDeviceCount() <= 0) - return false; - int device_id = getDevice(); if (device_id < 0) return false; @@ -80,9 +67,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { bool doesDeviceSupportFP16() { - if (getDeviceCount() <= 0) - return false; - int device_id = getDevice(); if (device_id < 0) return false; diff --git a/modules/dnn/src/cuda4dnn/kernels/activations.hpp b/modules/dnn/src/cuda4dnn/kernels/activations.hpp index 46f697fce3..0a7c9878fb 100644 --- a/modules/dnn/src/cuda4dnn/kernels/activations.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/activations.hpp @@ -45,6 +45,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template void power(const csl::Stream& stream, csl::Span output, csl::View input, T exp, T scale, T shift); + template + void exp(const csl::Stream& stream, csl::Span output, csl::View input, T normScale, T normShift); + }}}} /* namespace cv::dnn::cuda4dnn::kernels */ #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/kernels/resize.hpp b/modules/dnn/src/cuda4dnn/kernels/resize.hpp index 31aee3d371..4a3768a70a 100644 --- a/modules/dnn/src/cuda4dnn/kernels/resize.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/resize.hpp @@ -11,10 +11,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template - void resize_nn(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView input); + void resize_nn(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView input, float scale_y, float scale_x, bool round, bool half_pixel_centers); template - void resize_bilinear(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView input, float scale_y, float scale_x); + void resize_bilinear(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView input, float scale_y, float scale_x, bool half_pixel_centers); }}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda4dnn/primitives/activation.hpp b/modules/dnn/src/cuda4dnn/primitives/activation.hpp index fce996a89e..84b95927a3 100644 --- a/modules/dnn/src/cuda4dnn/primitives/activation.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/activation.hpp @@ -341,6 +341,36 @@ namespace cv { namespace dnn { namespace cuda4dnn { const T exp, scale, shift; }; + template + class ExpOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ExpOp(csl::Stream stream_, T nScale_, T nShift_) + : stream(std::move(stream_)), normScale{ nScale_ }, normShift{ nShift_ } { } + + void forward( + const std::vector>& inputs, + const std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + kernels::exp(stream, output, input, normScale, normShift); + } + } + + private: + csl::Stream stream; + const T normScale, normShift; + }; + }}} /* namespace cv::dnn::cuda4dnn */ #endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp index 8d788f05dc..12cf97404e 100644 --- a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp +++ 
b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp @@ -103,7 +103,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { const auto groups = config.groups; - CV_Assert (1 < convolution_order && convolution_order <= 3); + CV_Assert (1 <= convolution_order && convolution_order <= 3); const auto rank = input_shape.size(); const auto output_feature_maps = output_shape[1]; diff --git a/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp b/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp index 1102dc56fa..fc1002fc4e 100644 --- a/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp @@ -50,13 +50,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { window_size = config.window_size; const auto pooling_order = window_size.size(); - CV_Assert(pooling_order >= 1); strides = config.strides; CV_Assert(pooling_order == strides.size()); - if (pooling_order != 2 && pooling_order != 3) - CV_Error(Error::StsNotImplemented, "Only 2D/3D max-pooling are supported."); + if (pooling_order < 1 || pooling_order > 3) + CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D max-pooling are supported."); padding_left.resize(pooling_order); if (config.padMode == MaxPoolingConfiguration::PaddingMode::MANUAL) diff --git a/modules/dnn/src/cuda4dnn/primitives/resize.hpp b/modules/dnn/src/cuda4dnn/primitives/resize.hpp index 0ac7b94e19..1465aa8867 100644 --- a/modules/dnn/src/cuda4dnn/primitives/resize.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/resize.hpp @@ -20,14 +20,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { BILINEAR }; + struct ResizeConfiguration { + InterpolationType type; + bool align_corners; + bool half_pixel_centers; + }; + template class ResizeOp final : public CUDABackendNode { public: using wrapper_type = GetCUDABackendWrapperType; - ResizeOp(csl::Stream stream_, InterpolationType type_, float scaleHeight_, float scaleWidth_) - : stream(std::move(stream_)), type{ type_ }, scaleHeight{ scaleHeight_ }, scaleWidth{ scaleWidth_ } + ResizeOp(csl::Stream stream_, const ResizeConfiguration& config) + : stream(std::move(stream_)) { + type = config.type; + align_corners = config.align_corners; + half_pixel_centers = config.half_pixel_centers; } void forward( @@ -44,16 +53,27 @@ namespace cv { namespace dnn { namespace cuda4dnn { auto output_wrapper = outputs[0].dynamicCast(); auto output = output_wrapper->getSpan(); + const auto compute_scale = [this](std::size_t input_size, std::size_t output_size) { + return (align_corners && output_size > 1) ? 
+ static_cast(input_size - 1) / (output_size - 1) : + static_cast(input_size) / output_size; + }; + + auto out_height = output.get_axis_size(-2), out_width = output.get_axis_size(-1); + auto in_height = input.get_axis_size(-2), in_width = input.get_axis_size(-1); + float scale_height = compute_scale(in_height, out_height), + scale_width = compute_scale(in_width, out_width); + if (type == InterpolationType::NEAREST_NEIGHBOUR) - kernels::resize_nn(stream, output, input); + kernels::resize_nn(stream, output, input, scale_height, scale_width, align_corners, half_pixel_centers); else if (type == InterpolationType::BILINEAR) - kernels::resize_bilinear(stream, output, input, scaleHeight, scaleWidth); + kernels::resize_bilinear(stream, output, input, scale_height, scale_width, half_pixel_centers); } private: csl::Stream stream; InterpolationType type; - float scaleHeight, scaleWidth; /* for bilinear interpolation */ + bool align_corners, half_pixel_centers; }; }}} /* namespace cv::dnn::cuda4dnn */ diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp index c745d5f036..4915538ff7 100644 --- a/modules/dnn/src/darknet/darknet_io.cpp +++ b/modules/dnn/src/darknet/darknet_io.cpp @@ -241,6 +241,10 @@ namespace cv { { activation_param.type = "Sigmoid"; } + else if (type == "tanh") + { + activation_param.type = "TanH"; + } else { CV_Error(cv::Error::StsParseError, "Unsupported activation: " + type); @@ -554,6 +558,29 @@ namespace cv { fused_layer_names.push_back(last_layer); } + void setSAM(int from) + { + cv::dnn::LayerParams eltwise_param; + eltwise_param.name = "SAM-name"; + eltwise_param.type = "Eltwise"; + + eltwise_param.set("operation", "prod"); + eltwise_param.set("output_channels_mode", "same"); + + darknet::LayerParameter lp; + std::string layer_name = cv::format("sam_%d", layer_id); + lp.layer_name = layer_name; + lp.layer_type = eltwise_param.type; + lp.layerParams = eltwise_param; + lp.bottom_indexes.push_back(last_layer); + lp.bottom_indexes.push_back(fused_layer_names.at(from)); + last_layer = layer_name; + net->layers.push_back(lp); + + layer_id++; + fused_layer_names.push_back(last_layer); + } + void setUpsample(int scaleFactor) { cv::dnn::LayerParams param; @@ -620,7 +647,7 @@ namespace cv { // read section read_net = false; ++layers_counter; - const size_t layer_type_size = line.find("]") - 1; + const size_t layer_type_size = line.find(']') - 1; CV_Assert(layer_type_size < line.size()); std::string layer_type = line.substr(1, layer_type_size); net->layers_cfg[layers_counter]["layer_type"] = layer_type; @@ -833,6 +860,14 @@ namespace cv { from = from < 0 ? from + layers_counter : from; setParams.setScaleChannels(from); } + else if (layer_type == "sam") + { + std::string bottom_layer = getParam(layer_params, "from", ""); + CV_Assert(!bottom_layer.empty()); + int from = std::atoi(bottom_layer.c_str()); + from = from < 0 ? 
from + layers_counter : from; + setParams.setSAM(from); + } else if (layer_type == "upsample") { int scaleFactor = getParam(layer_params, "stride", 1); diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 0f60a393a5..668cce8fa6 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -93,6 +94,13 @@ static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false); static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false); +bool DNN_DIAGNOSTICS_RUN = false; + +void enableModelDiagnostics(bool isDiagnosticsMode) +{ + DNN_DIAGNOSTICS_RUN = isDiagnosticsMode; +} + using std::vector; using std::map; using std::make_pair; @@ -239,11 +247,10 @@ private: #endif #ifdef HAVE_CUDA - if (haveCUDA() && cuda4dnn::isDeviceCompatible()) + if (haveCUDA()) { backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA)); - if (cuda4dnn::doesDeviceSupportFP16()) - backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16)); + backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16)); } #endif } @@ -1383,11 +1390,12 @@ struct Net::Impl : public detail::NetImplBase CV_Assert(preferableBackend != DNN_BACKEND_HALIDE || preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); +#ifdef HAVE_INF_ENGINE if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { CV_Assert( - preferableTarget == DNN_TARGET_CPU || + (preferableTarget == DNN_TARGET_CPU && (!isArmComputePlugin() || preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)) || preferableTarget == DNN_TARGET_OPENCL || preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD || @@ -1395,6 +1403,7 @@ struct Net::Impl : public detail::NetImplBase preferableTarget == DNN_TARGET_FPGA ); } +#endif CV_Assert(preferableBackend != DNN_BACKEND_VKCOM || preferableTarget == DNN_TARGET_VULKAN); CV_Assert(preferableBackend != DNN_BACKEND_CUDA || @@ -2099,8 +2108,8 @@ struct Net::Impl : public detail::NetImplBase return; } - bool supportsCPUFallback = preferableTarget == DNN_TARGET_CPU || - BackendRegistry::checkIETarget(DNN_TARGET_CPU); + bool supportsCPUFallback = !isArmComputePlugin() && (preferableTarget == DNN_TARGET_CPU || + BackendRegistry::checkIETarget(DNN_TARGET_CPU)); // Build Inference Engine networks from sets of layers that support this // backend. Split a whole model on several Inference Engine networks if @@ -2363,6 +2372,9 @@ struct Net::Impl : public detail::NetImplBase CV_Assert(preferableBackend == DNN_BACKEND_CUDA); #ifdef HAVE_CUDA + if (!cudaInfo) /* we need to check only once */ + cuda4dnn::checkVersions(); + if (cuda4dnn::getDeviceCount() <= 0) CV_Error(Error::StsError, "No CUDA capable device found."); @@ -2373,7 +2385,10 @@ struct Net::Impl : public detail::NetImplBase CV_Error(Error::GpuNotSupported, "OpenCV was not built to work with the selected device. 
Please check CUDA_ARCH_PTX or CUDA_ARCH_BIN in your build configuration."); if (preferableTarget == DNN_TARGET_CUDA_FP16 && !cuda4dnn::doesDeviceSupportFP16()) - CV_Error(Error::StsError, "The selected CUDA device does not support FP16 operations."); + { + CV_LOG_WARNING(NULL, "The selected CUDA device does not support FP16 target; switching to FP32 target."); + preferableTarget = DNN_TARGET_CUDA; + } if (!cudaInfo) { @@ -2384,7 +2399,6 @@ struct Net::Impl : public detail::NetImplBase auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers cudaInfo = std::unique_ptr(new CudaInfo_t(std::move(context), std::move(d2h_stream))); - cuda4dnn::checkVersions(); } cudaInfo->workspace = cuda4dnn::csl::Workspace(); // release workspace memory if any @@ -2972,7 +2986,7 @@ struct Net::Impl : public detail::NetImplBase // the concatenation optimization is applied with batch_size > 1. // so, for now, we only apply this optimization in the most popular // case batch_size == 1. - int axis = clamp(concatLayer->axis, output.dims); + int axis = normalize_axis(concatLayer->axis, output.dims); if( output.total(0, axis) == 1 ) { size_t i, ninputs = ld.inputBlobsId.size(); @@ -4461,7 +4475,7 @@ string Net::Impl::dump() prevNode = itBackend->second; } } - string colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151"}; + std::vector colors = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151", "#b266ff"}; string backend; switch (prefBackend) { @@ -4613,6 +4627,7 @@ string Net::Impl::dump() case DNN_TARGET_CUDA_FP16: out << "CUDA_FP16"; colorId = 6; break; // don't use default: } + CV_Assert(colorId < colors.size()); out << "\\n"; // align center out << ((clusterIds.size() == 1)? 
"\" " : " }\" "); out << "fillcolor=\"" << colors[colorId] << "\" "; @@ -5303,15 +5318,13 @@ static Mutex& getLayerFactoryMutex() return *instance; } -typedef std::map > LayerFactory_Impl; - static LayerFactory_Impl& getLayerFactoryImpl_() { static LayerFactory_Impl impl; return impl; } -static LayerFactory_Impl& getLayerFactoryImpl() +LayerFactory_Impl& getLayerFactoryImpl() { static LayerFactory_Impl* volatile instance = NULL; if (instance == NULL) diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp index c646c1fe3a..49717f8513 100644 --- a/modules/dnn/src/ie_ngraph.cpp +++ b/modules/dnn/src/ie_ngraph.cpp @@ -772,8 +772,14 @@ static InferenceEngine::Layout estimateLayout(const Mat& m) { if (m.dims == 4) return InferenceEngine::Layout::NCHW; + else if (m.dims == 3) + return InferenceEngine::Layout::CHW; else if (m.dims == 2) return InferenceEngine::Layout::NC; + else if (m.dims == 1) + return InferenceEngine::Layout::C; + else if (m.dims == 5) + return InferenceEngine::Layout::NCDHW; else return InferenceEngine::Layout::ANY; } diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp index 570a6ff665..698168817f 100644 --- a/modules/dnn/src/init.cpp +++ b/modules/dnn/src/init.cpp @@ -110,6 +110,7 @@ void initializeLayerFactory() CV_DNN_REGISTER_LAYER_CLASS(BNLL, BNLLLayer); CV_DNN_REGISTER_LAYER_CLASS(AbsVal, AbsLayer); CV_DNN_REGISTER_LAYER_CLASS(Power, PowerLayer); + CV_DNN_REGISTER_LAYER_CLASS(Exp, ExpLayer); CV_DNN_REGISTER_LAYER_CLASS(BatchNorm, BatchNormLayer); CV_DNN_REGISTER_LAYER_CLASS(MaxUnpool, MaxUnpoolLayer); CV_DNN_REGISTER_LAYER_CLASS(Dropout, BlankLayer); diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 1168755a29..edd9948db1 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -401,7 +401,11 @@ public: shape[1] = weights_.total(); auto weight = std::make_shared(ngraph::element::f32, ngraph::Shape(shape), weights_.data); auto bias = std::make_shared(ngraph::element::f32, ngraph::Shape(shape), bias_.data); +#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2) + auto scale_node = std::make_shared(ieInpNode, weight, ngraph::op::AutoBroadcastType::NUMPY); +#else auto scale_node = std::make_shared(ieInpNode, weight, ngraph::op::AutoBroadcastType::NUMPY); +#endif auto scale_shift = std::make_shared(scale_node, bias, ngraph::op::AutoBroadcastType::NUMPY); return Ptr(new InfEngineNgraphNode(scale_shift)); } diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index 8a0f4a67c6..a950c56167 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -79,7 +79,7 @@ public: { CV_Assert(inputs.size() > 0); outputs.resize(1, inputs[0]); - int cAxis = clamp(axis, inputs[0]); + int cAxis = normalize_axis(axis, inputs[0]); int axisSum = 0; for (size_t i = 0; i < inputs.size(); i++) @@ -201,7 +201,7 @@ public: inps.getUMatVector(inputs); outs.getUMatVector(outputs); - int cAxis = clamp(axis, inputs[0].dims); + int cAxis = normalize_axis(axis, inputs[0].dims); if (padding) return false; @@ -255,7 +255,7 @@ public: inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); - int cAxis = clamp(axis, inputs[0].dims); + int cAxis = normalize_axis(axis, inputs[0].dims); Mat& outMat = outputs[0]; if (padding) @@ -296,7 +296,7 @@ public: auto context = reinterpret_cast(context_); auto input_wrapper = inputs[0].dynamicCast(); - auto concat_axis = clamp(axis, 
input_wrapper->getRank()); + auto concat_axis = normalize_axis(axis, input_wrapper->getRank()); return make_cuda_node(preferableTarget, std::move(context->stream), concat_axis, padding); } #endif @@ -305,7 +305,7 @@ public: { #ifdef HAVE_VULKAN vkcom::Tensor in = VkComTensor(input[0]); - int cAxis = clamp(axis, in.dimNum()); + int cAxis = normalize_axis(axis, in.dimNum()); std::shared_ptr op(new vkcom::OpConcat(cAxis)); return Ptr(new VkComBackendNode(input, op)); #endif // HAVE_VULKAN @@ -341,7 +341,7 @@ public: InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::Builder::ConcatLayer ieLayer(name); - ieLayer.setAxis(clamp(axis, input->getDims().size())); + ieLayer.setAxis(normalize_axis(axis, input->getDims().size())); ieLayer.setInputPorts(std::vector(inputs.size())); return Ptr(new InfEngineBackendNode(ieLayer)); } @@ -354,7 +354,7 @@ public: { InferenceEngine::DataPtr data = ngraphDataNode(inputs[0]); const int numDims = data->getDims().size(); - const int cAxis = clamp(axis, numDims); + const int cAxis = normalize_axis(axis, numDims); std::vector maxDims(numDims, 0); CV_Assert(inputs.size() == nodes.size()); diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 02495f45ea..fb57f26511 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -125,6 +125,9 @@ public: { kernel_size.assign(1, kernel_size[0]); strides.assign(1, strides[0]); + dilations.assign(1, dilations[0]); + pads_begin.assign(1, pads_begin[0]); + pads_end.assign(1, pads_end[0]); } CV_Assert(weightShape.dims() == kernel_size.size() + 2); for (int i = 0; i < kernel_size.size(); i++) { @@ -311,8 +314,8 @@ public: #ifdef HAVE_CUDA if (backendId == DNN_BACKEND_CUDA) { - /* only convolution 2d and 3d supported */ - if (ksize == 2 || ksize == 3) + /* only 1d, 2d and 3d convolutions supported */ + if (ksize > 0 && ksize <= 3) return true; return false; @@ -321,10 +324,13 @@ public: #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { - if (ksize == 1) + bool isArmTarget = preferableTarget == DNN_TARGET_CPU && isArmComputePlugin(); + if (isArmTarget && blobs.empty()) return false; + if (ksize == 1) + return isArmTarget; if (ksize == 3) - return preferableTarget == DNN_TARGET_CPU; + return preferableTarget != DNN_TARGET_MYRIAD && !isArmTarget; bool isMyriad = preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL; if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || !isMyriad) && blobs.empty()) return false; @@ -802,7 +808,7 @@ public: CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1); auto& ieInpNode = nodes[0].dynamicCast()->node; std::vector dims = ieInpNode->get_shape(); - CV_Assert(dims.size() == 4 || dims.size() == 5); + CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, ""); std::shared_ptr ieWeights = nodes.size() > 1 ? 
nodes[1].dynamicCast()->node : nullptr; if (nodes.size() > 1) CV_Assert(ieWeights); // dynamic_cast should not fail @@ -840,7 +846,7 @@ public: else { auto shape = std::make_shared(ngraph::element::i64, - ngraph::Shape{kernel_shape.size()}, kernel_shape.data()); + ngraph::Shape{kernel_shape.size()}, std::vector(kernel_shape.begin(), kernel_shape.end())); ieWeights = std::make_shared(ieWeights, shape, true); } @@ -875,7 +881,7 @@ public: if (nodes.size() == 3) { auto bias_shape = std::make_shared(ngraph::element::i64, - ngraph::Shape{shape.size()}, shape.data()); + ngraph::Shape{shape.size()}, std::vector(shape.begin(), shape.end())); bias = std::make_shared(nodes[2].dynamicCast()->node, bias_shape, true); } else @@ -1244,7 +1250,7 @@ public: v20*vw20 + v21*vw21 + v22*vw22 + vbias; if (relu) vout = v_select(vout > z, vout, vout*vrc); - vx_store(outptr + out_j, vout); + v_store(outptr + out_j, vout); } } #endif @@ -1597,15 +1603,15 @@ public: v_float32x4 r2 = v_load_aligned(rptr + vsz_a*2); v_float32x4 r3 = v_load_aligned(rptr + vsz_a*3); - vs00 += w0*r0; - vs01 += w0*r1; - vs02 += w0*r2; - vs03 += w0*r3; + vs00 = v_fma(w0, r0, vs00); + vs01 = v_fma(w0, r1, vs01); + vs02 = v_fma(w0, r2, vs02); + vs03 = v_fma(w0, r3, vs03); - vs10 += w1*r0; - vs11 += w1*r1; - vs12 += w1*r2; - vs13 += w1*r3; + vs10 = v_fma(w1, r0, vs10); + vs11 = v_fma(w1, r1, vs11); + vs12 = v_fma(w1, r2, vs12); + vs13 = v_fma(w1, r3, vs13); } s0 += v_reduce_sum4(vs00, vs01, vs02, vs03); s1 += v_reduce_sum4(vs10, vs11, vs12, vs13); @@ -1688,16 +1694,7 @@ public: umat_blobs.resize(n); for (size_t i = 0; i < n; i++) { - if (use_half) - { - Mat matFP32; - convertFp16(inputs[i + 1], matFP32); - matFP32.copyTo(umat_blobs[i]); - } - else - { - inputs[i + 1].copyTo(umat_blobs[i]); - } + inputs[i + 1].copyTo(umat_blobs[i]); } inputs.resize(1); } @@ -1708,7 +1705,10 @@ public: umat_blobs.resize(n); for (size_t i = 0; i < n; i++) { - blobs[i].copyTo(umat_blobs[i]); + if (use_half) + convertFp16(blobs[i], umat_blobs[i]); + else + blobs[i].copyTo(umat_blobs[i]); } } @@ -1764,14 +1764,20 @@ public: if (fusedWeights) { - weightsMat.copyTo(umat_blobs[0]); + if (use_half) + convertFp16(weightsMat, umat_blobs[0]); + else + weightsMat.copyTo(umat_blobs[0]); fusedWeights = false; } if (fusedBias) { if ( umat_blobs.size() < 2 ) umat_blobs.resize(2); - umat_blobs[1] = UMat(biasvec, true); + if (use_half) + convertFp16(Mat(biasvec, true), umat_blobs[1]); + else + Mat(biasvec, true).copyTo(umat_blobs[1]); convolutionOp->setBias(true); fusedBias = false; } @@ -2001,6 +2007,21 @@ public: const auto groups = input_feature_maps / input_feature_maps_per_group; ConvolutionConfiguration config; + + if (input_shape.size() == 3) + { + // Conv1D + // We add an extra dim for input and output tensors, because CuDNN doesn't support convolution with 3D tensors + input_shape.insert(std::end(input_shape) - 1, 1); + output_shape.insert(std::end(output_shape) - 1, 1); + + // Do the similar thing for the other parameters + pads_begin.insert(std::begin(pads_begin), 0); + pads_end.insert(std::begin(pads_end), 0); + strides.insert(std::begin(strides), 1); + dilations.insert(std::begin(dilations), 1); + kernel_size.insert(std::begin(kernel_size), 1); + } config.kernel_size.assign(std::begin(kernel_size), std::end(kernel_size)); config.dilations.assign(std::begin(dilations), std::end(dilations)); config.strides.assign(std::begin(strides), std::end(strides)); @@ -2365,20 +2386,21 @@ public: for( ; n <= nmax - 4; n += 4 ) { + v_float32x4 d0 = v_load(dst0 + n); + v_float32x4 
d1 = v_load(dst1 + n); v_float32x4 b0 = v_load(bptr0 + n); v_float32x4 b1 = v_load(bptr1 + n); v_float32x4 b2 = v_load(bptr2 + n); v_float32x4 b3 = v_load(bptr3 + n); - v_float32x4 d0 = v_load(dst0 + n); - v_float32x4 d1 = v_load(dst1 + n); - d0 += b0*a00; - d1 += b0*a01; - d0 += b1*a10; - d1 += b1*a11; - d0 += b2*a20; - d1 += b2*a21; - d0 += b3*a30; - d1 += b3*a31; + // TODO try to improve pipeline width + d0 = v_fma(b0, a00, d0); + d1 = v_fma(b0, a01, d1); + d0 = v_fma(b1, a10, d0); + d1 = v_fma(b1, a11, d1); + d0 = v_fma(b2, a20, d0); + d1 = v_fma(b2, a21, d1); + d0 = v_fma(b3, a30, d0); + d1 = v_fma(b3, a31, d1); v_store(dst0 + n, d0); v_store(dst1 + n, d1); } @@ -2386,8 +2408,10 @@ public: for( ; n < nmax; n++ ) { - float b0 = bptr0[n], b1 = bptr1[n]; - float b2 = bptr2[n], b3 = bptr3[n]; + float b0 = bptr0[n]; + float b1 = bptr1[n]; + float b2 = bptr2[n]; + float b3 = bptr3[n]; float d0 = dst0[n] + alpha00*b0 + alpha10*b1 + alpha20*b2 + alpha30*b3; float d1 = dst1[n] + alpha01*b0 + alpha11*b1 + alpha21*b2 + alpha31*b3; dst0[n] = d0; diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index 40556191f5..de97c873af 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -138,6 +138,12 @@ public: typedef std::map > LabelBBox; + inline int getNumOfTargetClasses() { + unsigned numBackground = + (_backgroundLabelId >= 0 && _backgroundLabelId < _numClasses) ? 1 : 0; + return (_numClasses - numBackground); + } + bool getParameterDict(const LayerParams ¶ms, const std::string ¶meterName, DictValue& result) @@ -590,12 +596,13 @@ public: LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label); if (label_bboxes == decodeBBoxes.end()) CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label)); + int limit = (getNumOfTargetClasses() == 1) ? _keepTopK : std::numeric_limits::max(); if (_bboxesNormalized) NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK, - indices[c], util::caffe_norm_box_overlap); + indices[c], util::caffe_norm_box_overlap, limit); else NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK, - indices[c], util::caffe_box_overlap); + indices[c], util::caffe_box_overlap, limit); numDetections += indices[c].size(); } if (_keepTopK > -1 && numDetections > (size_t)_keepTopK) @@ -617,8 +624,13 @@ public: } } // Keep outputs k results per image. - std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(), - util::SortScorePairDescend >); + if ((_keepTopK * 8) > scoreIndexPairs.size()) { + std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(), + util::SortScorePairDescend >); + } else { + std::partial_sort(scoreIndexPairs.begin(), scoreIndexPairs.begin() + _keepTopK, scoreIndexPairs.end(), + util::SortScorePairDescend >); + } scoreIndexPairs.resize(_keepTopK); std::map > newIndices; @@ -853,16 +865,16 @@ public: for (int i = 0; i < num; ++i, locData += numPredsPerClass * numLocClasses * 4) { LabelBBox& labelBBox = locPreds[i]; + int start = shareLocation ? -1 : 0; + for (int c = 0; c < numLocClasses; ++c) { + labelBBox[start++].resize(numPredsPerClass); + } for (int p = 0; p < numPredsPerClass; ++p) { int startIdx = p * numLocClasses * 4; for (int c = 0; c < numLocClasses; ++c) { int label = shareLocation ? 
-1 : c; - if (labelBBox.find(label) == labelBBox.end()) - { - labelBBox[label].resize(numPredsPerClass); - } util::NormalizedBBox& bbox = labelBBox[label][p]; if (locPredTransposed) { diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index ed87a3e2fc..9bb5be342f 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -1354,11 +1354,15 @@ struct PowerFunctor : public BaseFunctor ngraph::Shape{1}, &scale); auto shift_node = std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &shift); - auto power_node = std::make_shared(ngraph::element::f32, - ngraph::Shape{1}, &power); auto mul = std::make_shared(scale_node, node, ngraph::op::AutoBroadcastType::NUMPY); auto scale_shift = std::make_shared(mul, shift_node, ngraph::op::AutoBroadcastType::NUMPY); + + if (power == 1) + return scale_shift; + + auto power_node = std::make_shared(ngraph::element::f32, + ngraph::Shape{1}, &power); return std::make_shared(scale_shift, power_node, ngraph::op::AutoBroadcastType::NUMPY); } #endif // HAVE_DNN_NGRAPH @@ -1400,6 +1404,120 @@ struct PowerFunctor : public BaseFunctor int64 getFLOPSPerElement() const { return power == 1 ? 2 : 10; } }; +struct ExpFunctor : public BaseFunctor +{ + typedef ExpLayer Layer; + float base, scale, shift; + float normScale, normShift; + + ExpFunctor(float base_ = -1.f, float scale_ = 1.f, float shift_ = 0.f) + : base(base_), scale(scale_), shift(shift_) + { + // For base > 0 : + // y = base^(scale * input + shift) + // ln(y) = ln(base)*(scale * input + shift) + // y = exp((ln(base)*scale) * input + (ln(base)*shift)) + // y = exp(normalized_scale * input + normalized_shift) + CV_Check(base, base == -1.f || base > 0.f, "Unsupported 'base' value"); + const float ln_base = (base == -1.f) ? 
1.f : log(base); + normScale = scale * ln_base; + normShift = shift * ln_base; + } + + bool supportBackend(int backendId, int targetId) + { + return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA || + backendId == DNN_BACKEND_HALIDE || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; + } + + void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const + { + float a = normScale, b = normShift; + for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) + { + for( int i = 0; i < len; i++ ) + { + float x = srcptr[i]; + dstptr[i] = exp(a*x + b); + } + } + } + +#ifdef HAVE_OPENCL + bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) + { + std::vector inputs; + std::vector outputs; + + inps.getUMatVector(inputs); + outs.getUMatVector(outputs); + String buildopt = oclGetTMacro(inputs[0]); + + for (size_t i = 0; i < inputs.size(); i++) + { + UMat& src = inputs[i]; + UMat& dst = outputs[i]; + + ocl::Kernel kernel("ExpForward", ocl::dnn::activations_oclsrc, buildopt); + kernel.set(0, (int)src.total()); + kernel.set(1, ocl::KernelArg::PtrReadOnly(src)); + kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst)); + kernel.set(3, (float)normScale); + kernel.set(4, (float)normShift); + + size_t gSize = src.total(); + CV_Assert(kernel.run(1, &gSize, NULL, false)); + } + return true; + } +#endif + +#ifdef HAVE_CUDA + Ptr initCUDA(int target, csl::Stream stream) + { + return make_cuda_node(target, stream, normScale, normShift); + } +#endif + +#ifdef HAVE_HALIDE + void attachHalide(const Halide::Expr& input, Halide::Func& top) + { + Halide::Var x("x"), y("y"), c("c"), n("n"); + top(x, y, c, n) = exp(normScale * input + normShift); + } +#endif // HAVE_HALIDE + +#ifdef HAVE_DNN_IE_NN_BUILDER_2019 + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + CV_Error(Error::StsNotImplemented, ""); + } +#endif // HAVE_DNN_IE_NN_BUILDER_2019 + +#ifdef HAVE_DNN_NGRAPH + std::shared_ptr initNgraphAPI(const std::shared_ptr& node) + { + auto scale_node = std::make_shared(ngraph::element::f32, + ngraph::Shape{1}, &normScale); + auto shift_node = std::make_shared(ngraph::element::f32, + ngraph::Shape{1}, &normShift); + auto mul = std::make_shared(scale_node, node, ngraph::op::AutoBroadcastType::NUMPY); + auto scale_shift = std::make_shared(mul, shift_node, ngraph::op::AutoBroadcastType::NUMPY); + return std::make_shared(scale_shift); + } +#endif // HAVE_DNN_NGRAPH + +#ifdef HAVE_VULKAN + std::shared_ptr initVkCom() + { + // TODO: add vkcom implementation + return std::shared_ptr(); + } +#endif // HAVE_VULKAN + + int64 getFLOPSPerElement() const { return 3; } +}; + struct ChannelsPReLUFunctor : public BaseFunctor { typedef ChannelsPReLULayer Layer; @@ -1634,6 +1752,20 @@ Ptr PowerLayer::create(const LayerParams& params) return l; } +Ptr ExpLayer::create(const LayerParams& params) +{ + float base = params.get("base", -1.0f); + float scale = params.get("scale", 1.0f); + float shift = params.get("shift", 0.0f); + Ptr l(new ElementWiseLayer(ExpFunctor(base, scale, shift))); + l->setParamsFrom(params); + l->base = base; + l->scale = scale; + l->shift = shift; + + return l; +} + Ptr ChannelsPReLULayer::create(const LayerParams& params) { CV_Assert(params.blobs.size() == 1); diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index 10ce70ff3e..a337c48d9e 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -46,6 +46,7 
@@ #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../ie_ngraph.hpp" +#include #ifdef HAVE_OPENCL #include "opencl_kernels_dnn.hpp" @@ -97,6 +98,7 @@ public: : outputChannels(0) { setParamsFrom(params); + hasVecInput = false; op = SUM; if (params.has("operation")) { @@ -156,6 +158,9 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { + if (hasVecInput && ELTWISE_CHANNNELS_SAME) + return backendId == DNN_BACKEND_OPENCV; + if (backendId == DNN_BACKEND_CUDA) { if(channelsModeInput == ELTWISE_CHANNNELS_INPUT_0 || channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE) @@ -211,9 +216,6 @@ public: { CV_Assert(0 && "Internal error"); } - - for (size_t j = 2; j < dims; j++) - CV_Assert(inputs[0][j] == inputs[i][j]); } channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME; @@ -221,9 +223,56 @@ public: outputs.assign(1, inputs[0]); outputs[0][1] = numChannels; + + if (dims > 2) + { + size_t vecIdx = 0; + bool isVecFound = false; + for (size_t i = 0; i < inputs.size(); i++) + { + bool allOnes = isAllOnes(inputs[i], 2, dims); + if (!allOnes && !isVecFound) + { + vecIdx = i; + isVecFound = true; + } + + if (!allOnes && i != vecIdx) + { + for (size_t j = 2; j < dims; j++) + { + CV_Assert(inputs[vecIdx][j] == inputs[i][j]); + } + } + } + + if (channelsModeInput == ELTWISE_CHANNNELS_SAME && isVecFound) + { + for (size_t j = 2; j < dims; j++) + { + outputs[0][j] = inputs[vecIdx][j]; + } + } + } + return false; } + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE + { + std::vector inputs; + inputs_arr.getMatVector(inputs); + + for (size_t i = 0; i < inputs.size(); i++) + { + MatShape inpShape = shape(inputs[i].size); + if (isAllOnes(inpShape, 2, inputs[i].dims)) + { + hasVecInput = true; + return; + } + } + } class EltwiseInvoker : public ParallelLoopBody { @@ -516,6 +565,9 @@ public: if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME)) return false; + if (hasVecInput) + return false; // TODO not implemented yet: https://github.com/opencv/opencv/pull/19477 + inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); @@ -616,6 +668,47 @@ public: CV_Assert(outputs.size() == 1); const int nstripes = getNumThreads(); + + if (channelsModeInput == ELTWISE_CHANNNELS_SAME && inputs[0].dims > 2) + { + for (size_t i = 0; i < inputs.size(); i++) + { + MatShape inpShape = shape(inputs[i].size); + bool allOnes = isAllOnes(inpShape, 2, inputs[i].dims); + + if (allOnes) + { + Mat tmpInput = inputs[i]; + MatShape outShape = shape(outputs[0].size); + size_t xSize = outShape[2]; + for (size_t j = 3; j < outShape.size(); j++) + xSize *= outShape[j]; + + int dimVec[3] = {outShape[0], outShape[1], (int) xSize}; + std::vector matSizesVec(&dimVec[0], &dimVec[0] + 3); + inputs[i] = Mat(matSizesVec, tmpInput.type()); + + std::vector idx(outShape.size(), 0); + std::vector outIdx(inpShape.size(), 0); + + for (size_t j = 0; j < outShape[0]; j++) + { + outIdx[0] = idx[0] = j; + for(size_t k = 0; k < outShape[1]; k++) + { + outIdx[1] = idx[1] = k; + for (size_t x = 0; x < xSize; x++) + { + outIdx[2] = x; + inputs[i].at(outIdx.data()) = tmpInput.at(idx.data()); + } + } + } + inputs[i] = inputs[i].reshape(0, outShape); + } + } + } + EltwiseInvoker::run(*this, &inputs[0], (int)inputs.size(), outputs[0], nstripes); @@ -795,6 +888,9 @@ public: } Ptr activ; + +private: + bool hasVecInput; }; Ptr EltwiseLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/flatten_layer.cpp 
b/modules/dnn/src/layers/flatten_layer.cpp index b5ecd8b8ee..7cf01a14fa 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -89,8 +89,8 @@ public: } int numAxes = inputs[0].size(); - int startAxis = clamp(_startAxis, numAxes); - int endAxis = clamp(_endAxis, numAxes); + int startAxis = normalize_axis(_startAxis, numAxes); + int endAxis = normalize_axis(_endAxis, numAxes); CV_Assert(startAxis >= 0); CV_Assert(endAxis >= startAxis && endAxis < (int)numAxes); @@ -120,8 +120,8 @@ public: inputs_arr.getMatVector(inputs); int numAxes = inputs[0].dims; - _startAxis = clamp(_startAxis, numAxes); - _endAxis = clamp(_endAxis, numAxes); + _startAxis = normalize_axis(_startAxis, numAxes); + _endAxis = normalize_axis(_endAxis, numAxes); } #ifdef HAVE_OPENCL @@ -195,8 +195,8 @@ virtual Ptr initNgraph(const std::vector >& inp std::vector dims = ieInpNode->get_shape(); int numAxes = dims.size(); - int startAxis = clamp(_startAxis, numAxes); - int endAxis = clamp(_endAxis, numAxes); + int startAxis = normalize_axis(_startAxis, numAxes); + int endAxis = normalize_axis(_endAxis, numAxes); CV_Assert(startAxis >= 0); CV_Assert(endAxis >= startAxis && endAxis < numAxes); diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index f46a02af3f..709420c3ca 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -132,7 +132,7 @@ public: CV_CheckEQ(blobs[0].dims, 2, ""); numOutput = blobs[0].size[0]; CV_Assert(!bias || (size_t)numOutput == blobs[1].total()); - cAxis = clamp(axis, inputs[0]); + cAxis = normalize_axis(axis, inputs[0]); } MatShape outShape(cAxis + 1); @@ -245,16 +245,18 @@ public: #if CV_SIMD128 for( ; i <= nw - 4; i += 4, wptr += 4*wstep ) { - v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f); - v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f); + v_float32x4 vs0 = v_setall_f32(0.f); + v_float32x4 vs1 = v_setall_f32(0.f); + v_float32x4 vs2 = v_setall_f32(0.f); + v_float32x4 vs3 = v_setall_f32(0.f); for( k = 0; k < vecsize; k += 4 ) { v_float32x4 v = v_load_aligned(sptr + k); - vs0 += v*v_load_aligned(wptr + k); - vs1 += v*v_load_aligned(wptr + wstep + k); - vs2 += v*v_load_aligned(wptr + wstep*2 + k); - vs3 += v*v_load_aligned(wptr + wstep*3 + k); + vs0 = v_fma(v, v_load_aligned(wptr + k), vs0); + vs1 = v_fma(v, v_load_aligned(wptr + wstep + k), vs1); + vs2 = v_fma(v, v_load_aligned(wptr + wstep*2 + k), vs2); + vs3 = v_fma(v, v_load_aligned(wptr + wstep*3 + k), vs3); } v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3); @@ -354,7 +356,7 @@ public: return true; } - int axisCan = clamp(axis, inputs[0].dims); + int axisCan = normalize_axis(axis, inputs[0].dims); int numOutput = blobs[0].size[0]; int innerSize = blobs[0].size[1]; int outerSize = total(shape(inputs[0]), 0, axisCan); @@ -475,7 +477,7 @@ public: if (!blobs.empty()) { - int axisCan = clamp(axis, input[0].dims); + int axisCan = normalize_axis(axis, input[0].dims); int outerSize = input[0].total(0, axisCan); for (size_t i = 0; i < input.size(); i++) @@ -523,7 +525,7 @@ public: auto input_wrapper = inputs[0].dynamicCast(); - auto flatten_start_axis = clamp(axis, input_wrapper->getRank()); + auto flatten_start_axis = normalize_axis(axis, input_wrapper->getRank()); auto biasMat_ = bias ? 
biasMat : Mat(); return make_cuda_node(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), flatten_start_axis, weightsMat, biasMat_); diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp index db986bc897..783949d4cd 100644 --- a/modules/dnn/src/layers/mvn_layer.cpp +++ b/modules/dnn/src/layers/mvn_layer.cpp @@ -403,7 +403,15 @@ public: const std::vector >& nodes) CV_OVERRIDE { auto& ieInpNode = nodes[0].dynamicCast()->node; +#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2021_2) auto mvn = std::make_shared(ieInpNode, acrossChannels, normVariance, eps); +#else + int64_t start_axis = acrossChannels ? 1 : 2; + std::vector axes_v(ieInpNode->get_shape().size() - start_axis); + std::iota(axes_v.begin(), axes_v.end(), start_axis); + auto axes = std::make_shared(ngraph::element::i64, ngraph::Shape{axes_v.size()}, axes_v.data()); + auto mvn = std::make_shared(ieInpNode, axes, normVariance, eps, ngraph::op::MVNEpsMode::INSIDE_SQRT); +#endif return Ptr(new InfEngineNgraphNode(mvn)); } #endif // HAVE_DNN_NGRAPH diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index a979fdedb6..24559543e1 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -126,8 +126,8 @@ public: const UMat& inp0 = inputs[0]; UMat& buffer = internals[0]; - startAxis = clamp(startAxis, inp0.dims); - endAxis = clamp(endAxis, inp0.dims); + startAxis = normalize_axis(startAxis, inp0.dims); + endAxis = normalize_axis(endAxis, inp0.dims); size_t num = total(shape(inp0.size), 0, startAxis); size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1); @@ -211,8 +211,8 @@ public: const Mat& inp0 = inputs[0]; Mat& buffer = internals[0]; - startAxis = clamp(startAxis, inp0.dims); - endAxis = clamp(endAxis, inp0.dims); + startAxis = normalize_axis(startAxis, inp0.dims); + endAxis = normalize_axis(endAxis, inp0.dims); const float* inpData = inp0.ptr(); float* outData = outputs[0].ptr(); @@ -334,8 +334,8 @@ public: if (!acrossSpatial) { axes_data.push_back(1); } else { - axes_data.resize(ieInpNode->get_shape().size()); - std::iota(axes_data.begin(), axes_data.end(), 0); + axes_data.resize(ieInpNode->get_shape().size() - 1); + std::iota(axes_data.begin(), axes_data.end(), 1); } auto axes = std::make_shared(ngraph::element::i64, ngraph::Shape{axes_data.size()}, axes_data); auto norm = std::make_shared(ieInpNode, axes, epsilon, ngraph::op::EpsMode::ADD); @@ -344,19 +344,18 @@ public: std::vector shape(ieInpNode->get_shape().size(), 1); shape[0] = blobs.empty() ? 
1 : batch; shape[1] = numChannels; - std::shared_ptr weight; - if (blobs.empty()) + if (!blobs.empty()) { - std::vector ones(numChannels, 1); - weight = std::make_shared(ngraph::element::f32, ngraph::Shape(shape), ones.data()); - } - else - { - weight = std::make_shared( + auto weight = std::make_shared( ngraph::element::f32, ngraph::Shape(shape), blobs[0].data); +#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2) + auto mul = std::make_shared(norm, weight, ngraph::op::AutoBroadcastType::NUMPY); +#else + auto mul = std::make_shared(norm, weight, ngraph::op::AutoBroadcastType::NUMPY); +#endif + return Ptr(new InfEngineNgraphNode(mul)); } - auto mul = std::make_shared(norm, weight, ngraph::op::AutoBroadcastType::NUMPY); - return Ptr(new InfEngineNgraphNode(mul)); + return Ptr(new InfEngineNgraphNode(norm)); } #endif // HAVE_DNN_NGRAPH @@ -378,8 +377,8 @@ public: NormalizeConfiguration config; config.input_shape.assign(std::begin(input_shape), std::end(input_shape)); - config.axis_start = clamp(startAxis, input_shape.size()); - config.axis_end = clamp(endAxis, input_shape.size()) + 1; /* +1 because NormalizeOp follows [start, end) convention */ + config.axis_start = normalize_axis(startAxis, input_shape.size()); + config.axis_end = normalize_axis(endAxis, input_shape.size()) + 1; /* +1 because NormalizeOp follows [start, end) convention */ config.norm = pnorm; config.eps = epsilon; diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp index b286133419..d182568795 100644 --- a/modules/dnn/src/layers/padding_layer.cpp +++ b/modules/dnn/src/layers/padding_layer.cpp @@ -105,9 +105,10 @@ public: if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { bool isMyriad = preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL; - return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) && - (!isMyriad || - (dstRanges.size() == 4 && paddings[0].first == 0 && paddings[0].second == 0)); + if (INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) && isMyriad) + return dstRanges.size() == 4 && paddings[0].first == 0 && paddings[0].second == 0; + + return (dstRanges.size() <= 4 || !isArmComputePlugin()); } #endif return backendId == DNN_BACKEND_OPENCV || diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index 05f8c380cc..c525c3f82f 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -113,6 +113,10 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { +#ifdef HAVE_INF_ENGINE + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && preferableTarget == DNN_TARGET_CPU) + return _order.size() <= 4 || !isArmComputePlugin(); +#endif return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA || ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine()) || diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 621315a572..b8e2cfdf8f 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -71,6 +71,14 @@ using std::min; using namespace cv::dnn::ocl4dnn; #endif +#ifdef HAVE_HALIDE +#if 0 // size_t is not well supported in Halide operations +typedef size_t HALIDE_DIFF_T; +#else +typedef int HALIDE_DIFF_T; +#endif +#endif + #ifdef HAVE_CUDA #include "../cuda4dnn/primitives/pooling.hpp" #include 
"../cuda4dnn/primitives/roi_pooling.hpp" @@ -78,6 +86,7 @@ using namespace cv::dnn::ocl4dnn; using namespace cv::dnn::cuda4dnn; #endif + namespace cv { namespace dnn @@ -169,14 +178,13 @@ public: if (inputs[0].dims == 3) { - //Pool1D - kernel_size.erase(kernel_size.begin() + 1); - strides.erase(strides.begin() + 1); - pads_begin.erase(pads_begin.begin() + 1); - pads_end.erase(pads_end.begin() + 1); + // Pool1D + kernel_size.assign(1, kernel_size[0]); + strides.assign(1, strides[0]); + pads_begin.assign(1, pads_begin[0]); + pads_end.assign(1, pads_end[0]); } - #ifdef HAVE_OPENCL poolOp.release(); #endif @@ -212,7 +220,9 @@ public: #endif if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { - return !computeMaxIdx && type != STOCHASTIC && kernel_size.size() > 1; +#ifdef HAVE_DNN_NGRAPH + return !computeMaxIdx && type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin()); +#endif } else if (backendId == DNN_BACKEND_OPENCV) { @@ -383,6 +393,19 @@ public: return make_cuda_node(preferableTarget, std::move(context->stream), config); } + if (input_shape.size() == 3) + { + // Pool1D + // We add an extra dim for input tensor, because CuDNN support pooling only with 2 and 3 spatial dimensions + input_shape.insert(std::end(input_shape) - 1, 1); + + // Do the similar thing for the other parameters + pads_begin.insert(std::begin(pads_begin), 0); + pads_end.insert(std::begin(pads_end), 0); + strides.insert(std::begin(strides), 1); + kernel_size.insert(std::begin(kernel_size), 1); + } + PoolingConfiguration config; if (type == MAX) { @@ -440,9 +463,9 @@ public: { int padding_mode; vkcom::PoolType pool_type; - int filter_size[2] = {kernel.height, kernel.width}; - int pad_size[2] = {pad.height, pad.width}; - int stride_size[2] = {stride.height, stride.width}; + int filter_size[2] = {static_cast(kernel_size[0]), static_cast(kernel_size[1])}; + int pad_size[2] = {static_cast(pads_begin[0]), static_cast(pads_begin[1])}; + int stride_size[2] = {static_cast(strides[0]), static_cast(strides[1])}; pool_type = type == MAX ? vkcom::kPoolTypeMax: (type == AVE ? vkcom::kPoolTypeAvg: vkcom::kPoolTypeNum); @@ -896,7 +919,7 @@ public: if (max_elem!=last) { dstData[x0] = *max_elem; - if( compMaxIdx ) + if( compMaxIdx && dstMaskData ) { dstMaskData[x0] = std::distance(first, max_elem); } @@ -1097,12 +1120,12 @@ public: Halide::Buffer inputBuffer = halideBuffer(inputs[0]); const int inWidth = inputBuffer.width(); const int inHeight = inputBuffer.height(); - const size_t kernelHeight = kernel_size[0]; - const size_t kernelWidth = kernel_size[1]; - const size_t strideHeight = strides[0]; - const size_t strideWidth = strides[1]; - const size_t paddingTop = pads_begin[0]; - const size_t paddingLeft = pads_begin[1]; + const HALIDE_DIFF_T kernelHeight = (HALIDE_DIFF_T)kernel_size[0]; + const HALIDE_DIFF_T kernelWidth = (HALIDE_DIFF_T)kernel_size[1]; + const HALIDE_DIFF_T strideHeight = (HALIDE_DIFF_T)strides[0]; + const HALIDE_DIFF_T strideWidth = (HALIDE_DIFF_T)strides[1]; + const HALIDE_DIFF_T paddingTop = (HALIDE_DIFF_T)pads_begin[0]; + const HALIDE_DIFF_T paddingLeft = (HALIDE_DIFF_T)pads_begin[1]; Halide::Var x("x"), y("y"), c("c"), n("n"); Halide::Func top = (name.empty() ? 
Halide::Func() : Halide::Func(name));
@@ -1148,10 +1171,10 @@ public:
 Halide::Buffer inputBuffer = halideBuffer(inputs[0]);
 const int inW = inputBuffer.width(), inH = inputBuffer.height();
- const size_t kernelHeight = kernel_size[0];
- const size_t kernelWidth = kernel_size[1];
- const size_t strideHeight = strides[0];
- const size_t strideWidth = strides[1];
+ const HALIDE_DIFF_T kernelHeight = (HALIDE_DIFF_T)kernel_size[0];
+ const HALIDE_DIFF_T kernelWidth = (HALIDE_DIFF_T)kernel_size[1];
+ const HALIDE_DIFF_T strideHeight = (HALIDE_DIFF_T)strides[0];
+ const HALIDE_DIFF_T strideWidth = (HALIDE_DIFF_T)strides[1];
 if ((inW - kernelWidth) % strideWidth || (inH - kernelHeight) % strideHeight)
 {
 CV_Error(cv::Error::StsNotImplemented,
diff --git a/modules/dnn/src/layers/proposal_layer.cpp b/modules/dnn/src/layers/proposal_layer.cpp
index 4658e7b41f..aeb5d44a47 100644
--- a/modules/dnn/src/layers/proposal_layer.cpp
+++ b/modules/dnn/src/layers/proposal_layer.cpp
@@ -54,11 +54,11 @@ public:
 for (int i = 0; i < ratios.size(); ++i)
 {
 float ratio = ratios.get(i);
+ float width = std::floor(baseSize / sqrt(ratio) + 0.5f);
+ float height = std::floor(width * ratio + 0.5f);
 for (int j = 0; j < scales.size(); ++j)
 {
 float scale = scales.get(j);
- float width = std::floor(baseSize / sqrt(ratio) + 0.5f);
- float height = std::floor(width * ratio + 0.5f);
 widths.push_back(scale * width);
 heights.push_back(scale * height);
 }
@@ -292,7 +292,8 @@ public:
 CV_Assert(imInfo.total() >= 2);
 // We've chosen the smallest data type because we need just a shape from it.
- fakeImageBlob.create(shape(1, 1, imInfo.at(0), imInfo.at(1)), CV_8UC1);
+ // We don't allocate memory; we just need the shape to be correct.
+ Mat fakeImageBlob(shape(1, 1, imInfo.at(0), imInfo.at(1)), CV_8UC1, NULL);
 // Generate prior boxes.
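
The proposal_layer hunk above hoists the width/height computation out of the scales loop, since both values depend only on the ratio. A minimal standalone sketch of the same loop-invariant motion (the function name and signature are illustrative, not the OpenCV internals):

#include <cmath>
#include <vector>

// Anchor widths/heights for every (ratio, scale) pair; width and height are
// invariant w.r.t. the inner loop, so they are computed once per ratio.
static void makeAnchorSizes(float baseSize,
                            const std::vector<float>& ratios,
                            const std::vector<float>& scales,
                            std::vector<float>& widths,
                            std::vector<float>& heights)
{
    for (float ratio : ratios)
    {
        const float width  = std::floor(baseSize / std::sqrt(ratio) + 0.5f);
        const float height = std::floor(width * ratio + 0.5f);
        for (float scale : scales)
        {
            widths.push_back(scale * width);
            heights.push_back(scale * height);
        }
    }
}
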
std::vector layerInputs(2), layerOutputs(1, priorBoxes); @@ -433,7 +434,6 @@ private: Ptr deltasPermute; Ptr scoresPermute; uint32_t keepTopBeforeNMS, keepTopAfterNMS, featStride, baseSize; - Mat fakeImageBlob; float nmsThreshold; DictValue ratios, scales; #ifdef HAVE_OPENCL diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp index 5ddb5342d0..7da211afb0 100644 --- a/modules/dnn/src/layers/region_layer.cpp +++ b/modules/dnn/src/layers/region_layer.cpp @@ -460,8 +460,10 @@ public: std::vector mask(anchors, 1); region = std::make_shared(tr_input, coords, classes, anchors, useSoftmax, mask, 1, 3, anchors_vec); + auto tr_shape = tr_input->get_shape(); auto shape_as_inp = std::make_shared(ngraph::element::i64, - ngraph::Shape{tr_input->get_shape().size()}, tr_input->get_shape().data()); + ngraph::Shape{tr_shape.size()}, + std::vector(tr_shape.begin(), tr_shape.end())); region = std::make_shared(region, shape_as_inp, true); new_axes = std::make_shared(ngraph::element::i64, ngraph::Shape{4}, std::vector{0, 2, 3, 1}); @@ -607,7 +609,7 @@ public: result = std::make_shared(result, tr_axes); if (b > 1) { - std::vector sizes = {(size_t)b, result->get_shape()[0] / b, result->get_shape()[1]}; + std::vector sizes{b, static_cast(result->get_shape()[0]) / b, static_cast(result->get_shape()[1])}; auto shape_node = std::make_shared(ngraph::element::i64, ngraph::Shape{sizes.size()}, sizes.data()); result = std::make_shared(result, shape_node, true); } diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index 4c603c1ac8..ab8f41c7b6 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -66,14 +66,7 @@ static void computeShapeByReshapeMask(const MatShape &srcShape, int srcShapeSize = (int)srcShape.size(); int maskShapeSize = (int)maskShape.size(); - if (srcRange == Range::all()) - srcRange = Range(0, srcShapeSize); - else - { - int sz = srcRange.size(); - srcRange.start = clamp(srcRange.start, srcShapeSize); - srcRange.end = srcRange.end == INT_MAX ? srcShapeSize : srcRange.start + sz; - } + srcRange = normalize_axis_range(srcRange, srcShapeSize); bool explicitMask = !maskShape.empty(); // All mask values are positive. 
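
Many hunks in this patch rename the old clamp() helpers to normalize_axis()/normalize_axis_range(), including the computeShapeByReshapeMask() change above. A minimal sketch of the intended semantics, assuming NumPy-style negative indexing; this is an illustrative reimplementation, not the dnn helper itself:

#include <climits>
#include <opencv2/core.hpp>

// Negative axes count from the end: axis -1 on a 4-D shape becomes 3.
static int normalizeAxisSketch(int axis, int dims)
{
    int a = axis < 0 ? axis + dims : axis;
    CV_Assert(0 <= a && a < dims);
    return a;
}

// Range::all() maps to [0, dims); an INT_MAX end means "up to the last axis",
// mirroring the code replaced in computeShapeByReshapeMask().
static cv::Range normalizeAxisRangeSketch(cv::Range r, int dims)
{
    if (r == cv::Range::all())
        return cv::Range(0, dims);
    const int sz = r.size();
    r.start = normalizeAxisSketch(r.start, dims);
    r.end = (r.end == INT_MAX) ? dims : r.start + sz;
    return r;
}
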
for (int i = 0, n = maskShape.size(); i < n && explicitMask; ++i) diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index a19c2d050f..e872c7f6b0 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -48,6 +48,7 @@ public: CV_Check(interpolation, interpolation == "nearest" || interpolation == "opencv_linear" || interpolation == "bilinear", ""); alignCorners = params.get("align_corners", false); + halfPixelCenters = params.get("half_pixel_centers", false); } bool getMemoryShapes(const std::vector &inputs, @@ -71,7 +72,7 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { if (backendId == DNN_BACKEND_CUDA) - return interpolation == "nearest" || interpolation == "bilinear"; + return interpolation == "nearest" || interpolation == "bilinear" || interpolation == "opencv_linear"; #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) @@ -123,7 +124,7 @@ public: Mat& inp = inputs[0]; Mat& out = outputs[0]; - if (interpolation == "nearest" || interpolation == "opencv_linear") + if ((interpolation == "nearest" && !alignCorners && !halfPixelCenters) || interpolation == "opencv_linear" || (interpolation == "bilinear" && halfPixelCenters)) { InterpolationFlags mode = interpolation == "nearest" ? INTER_NEAREST : INTER_LINEAR; for (size_t n = 0; n < inputs[0].size[0]; ++n) @@ -135,6 +136,54 @@ public: } } } + else if (interpolation == "nearest") + { + const int inpHeight = inp.size[2]; + const int inpWidth = inp.size[3]; + const int inpSpatialSize = inpHeight * inpWidth; + const int outSpatialSize = outHeight * outWidth; + const int numPlanes = inp.size[0] * inp.size[1]; + CV_Assert_N(inp.isContinuous(), out.isContinuous()); + + Mat inpPlanes = inp.reshape(1, numPlanes * inpHeight); + Mat outPlanes = out.reshape(1, numPlanes * outHeight); + + float heightOffset = 0.0f; + float widthOffset = 0.0f; + + if (halfPixelCenters) + { + heightOffset = 0.5f * scaleHeight; + widthOffset = 0.5f * scaleWidth; + } + + for (int y = 0; y < outHeight; ++y) + { + float input_y = y * scaleHeight + heightOffset; + int y0 = halfPixelCenters ? std::floor(input_y) : lroundf(input_y); + y0 = std::min(y0, inpHeight - 1); + + const float* inpData_row = inpPlanes.ptr(y0); + + for (int x = 0; x < outWidth; ++x) + { + float input_x = x * scaleWidth + widthOffset; + int x0 = halfPixelCenters ? 
std::floor(input_x) : lroundf(input_x); + x0 = std::min(x0, inpWidth - 1); + + float* outData = outPlanes.ptr(y, x); + const float* inpData_row_c = inpData_row; + + for (int c = 0; c < numPlanes; ++c) + { + *outData = inpData_row_c[x0]; + + inpData_row_c += inpSpatialSize; + outData += outSpatialSize; + } + } + } + } else if (interpolation == "bilinear") { const int inpHeight = inp.size[2]; @@ -218,6 +267,7 @@ public: { auto& ieInpNode = nodes[0].dynamicCast()->node; +#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2021_2) ngraph::op::InterpolateAttrs attrs; attrs.pads_begin.push_back(0); attrs.pads_end.push_back(0); @@ -236,6 +286,37 @@ public: std::vector shape = {outHeight, outWidth}; auto out_shape = std::make_shared(ngraph::element::i64, ngraph::Shape{2}, shape.data()); auto interp = std::make_shared(ieInpNode, out_shape, attrs); +#else + ngraph::op::v4::Interpolate::InterpolateAttrs attrs; + + if (interpolation == "nearest") { + attrs.mode = ngraph::op::v4::Interpolate::InterpolateMode::nearest; + attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::half_pixel; + } else if (interpolation == "bilinear") { + attrs.mode = ngraph::op::v4::Interpolate::InterpolateMode::linear_onnx; + attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::asymmetric; + } else { + CV_Error(Error::StsNotImplemented, format("Unsupported interpolation: %s", interpolation.c_str())); + } + attrs.shape_calculation_mode = ngraph::op::v4::Interpolate::ShapeCalcMode::sizes; + + if (alignCorners) { + attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::align_corners; + } + + attrs.nearest_mode = ngraph::op::v4::Interpolate::NearestMode::round_prefer_floor; + + std::vector shape = {outHeight, outWidth}; + auto out_shape = std::make_shared(ngraph::element::i64, ngraph::Shape{2}, shape.data()); + + auto& input_shape = ieInpNode->get_shape(); + CV_Assert_N(input_shape[2] != 0, input_shape[3] != 0); + std::vector scales = {static_cast(outHeight) / input_shape[2], static_cast(outWidth) / input_shape[3]}; + auto scales_shape = std::make_shared(ngraph::element::f32, ngraph::Shape{2}, scales.data()); + + auto axes = std::make_shared(ngraph::element::i64, ngraph::Shape{2}, std::vector{2, 3}); + auto interp = std::make_shared(ieInpNode, out_shape, scales_shape, axes, attrs); +#endif return Ptr(new InfEngineNgraphNode(interp)); } #endif // HAVE_DNN_NGRAPH @@ -250,15 +331,28 @@ public: { auto context = reinterpret_cast(context_); - cuda4dnn::InterpolationType itype; + cuda4dnn::ResizeConfiguration config; if (interpolation == "nearest") - itype = InterpolationType::NEAREST_NEIGHBOUR; + { + config.type = InterpolationType::NEAREST_NEIGHBOUR; + config.align_corners = alignCorners; + config.half_pixel_centers = halfPixelCenters; + } else if (interpolation == "bilinear") - itype = InterpolationType::BILINEAR; + { + config.type = InterpolationType::BILINEAR; + config.align_corners = alignCorners; + config.half_pixel_centers = halfPixelCenters; + } + else if (interpolation == "opencv_linear") + { + config.type = InterpolationType::BILINEAR; + config.align_corners = false; + config.half_pixel_centers = true; + } else CV_Error(Error::StsNotImplemented, "Requested interpolation mode is not available in resize layer."); - - return make_cuda_node(preferableTarget, std::move(context->stream), itype, scaleHeight, scaleWidth); + return make_cuda_node(preferableTarget, std::move(context->stream), config); } #endif @@ -269,6 
+363,7 @@ protected: String interpolation; float scaleWidth, scaleHeight; bool alignCorners; + bool halfPixelCenters; }; diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index f348b1e5be..a5c268214e 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -305,7 +305,7 @@ public: numChannels = blobs[0].total(); std::vector shape(ieInpNode0->get_shape().size(), 1); - int cAxis = clamp(axis, shape.size()); + int cAxis = normalize_axis(axis, shape.size()); shape[cAxis] = numChannels; auto node = ieInpNode0; @@ -314,7 +314,11 @@ public: auto weight = blobs.empty() ? ieInpNode1 : std::make_shared(ngraph::element::f32, ngraph::Shape(shape), blobs[0].data); +#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2) + node = std::make_shared(node, weight, ngraph::op::AutoBroadcastType::NUMPY); +#else node = std::make_shared(node, weight, ngraph::op::AutoBroadcastType::NUMPY); +#endif } if (hasBias || !hasWeights) { diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index fa2d755b71..54e2340387 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -70,6 +70,7 @@ public: SliceLayerImpl(const LayerParams& params) { setParamsFrom(params); + hasSteps = false; axis = params.get("axis", 1); num_split = params.get("num_split", 0); hasDynamicShapes = params.get("has_dynamic_shapes", false); @@ -79,7 +80,7 @@ public: CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end")); const DictValue &indicesValue = params.get("slice_point"); sliceRanges.resize(indicesValue.size() + 1, - std::vector(axis + 1, Range::all())); + std::vector(std::max(axis,0) + 1, Range::all())); int prevSlice = 0; for (int i = 0; i < indicesValue.size(); ++i) { @@ -118,6 +119,22 @@ public: sliceRanges[0][i].end = end; // We'll finalize a negative value later. 
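
The slice_layer hunk that follows adds ONNX-style steps: with a step greater than 1, the output length along an axis is the ceiling of the range length divided by the step, which is exactly the (len + step - 1) / step expression used in getMemoryShapes() below. A minimal sketch with a hypothetical helper name:

#include <cassert>

// Output length of a slice [begin, end) taken with a stride of `step`.
static int slicedLength(int begin, int end, int step)
{
    assert(step >= 1 && end >= begin);
    return (end - begin + step - 1) / step; // ceil((end - begin) / step)
}

// e.g. slicedLength(0, 10, 3) == 4 -> picks indices 0, 3, 6, 9
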
} } + + if (params.has("steps")) + { + const DictValue &steps = params.get("steps"); + sliceSteps.resize(1); + sliceSteps[0].resize(steps.size()); + + for (int i = 0; i < steps.size(); ++i) + { + int step = steps.get(i); + CV_Assert(step >= 1); + if (step > 1) + hasSteps = true; + sliceSteps[0][i] = step; + } + } } } @@ -126,14 +143,17 @@ public: #ifdef HAVE_DNN_IE_NN_BUILDER_2019 if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) && - sliceRanges.size() == 1 && sliceRanges[0].size() == 4; + sliceRanges.size() == 1 && sliceRanges[0].size() == 4 && !hasSteps; #endif #ifdef HAVE_DNN_NGRAPH if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) - return sliceRanges.size() == 1; + return sliceRanges.size() == 1 && !hasSteps; #endif - return backendId == DNN_BACKEND_OPENCV || - backendId == DNN_BACKEND_CUDA; +#ifdef HAVE_CUDA + if (backendId == DNN_BACKEND_CUDA) + return !hasSteps; +#endif + return backendId == DNN_BACKEND_OPENCV; } bool getMemoryShapes(const std::vector &inputs, @@ -153,7 +173,10 @@ public: for (int j = 0; j < sliceRanges[i].size(); ++j) { if (shapesInitialized || inpShape[j] > 0) - outputs[i][j] = clamp(sliceRanges[i][j], inpShape[j]).size(); + outputs[i][j] = normalize_axis_range(sliceRanges[i][j], inpShape[j]).size(); + + if (!sliceSteps.empty() && (i < sliceSteps.size()) && (j < sliceSteps[i].size()) && (sliceSteps[i][j] > 1)) + outputs[i][j] = (outputs[i][j] + sliceSteps[i][j] - 1) / sliceSteps[i][j]; } } } @@ -188,6 +211,7 @@ public: const MatSize& inpShape = inputs[0].size; finalSliceRanges = sliceRanges; + if (sliceRanges.empty()) { // Divide input blob on equal parts by axis. @@ -216,10 +240,13 @@ public: // Clamp. for (int j = 0; j < finalSliceRanges[i].size(); ++j) { - finalSliceRanges[i][j] = clamp(finalSliceRanges[i][j], inpShape[j]); + finalSliceRanges[i][j] = normalize_axis_range(finalSliceRanges[i][j], inpShape[j]); } } + if (!sliceSteps.empty() && sliceSteps[0].size() != inputs[0].dims) + sliceSteps[0].resize(inputs[0].dims, 1); + #if 0 std::cout << "DEBUG: DNN/Slice: " << outputs.size() << " inpShape=" << inpShape << std::endl; for (int i = 0; i < outputs.size(); ++i) @@ -427,6 +454,9 @@ public: { CV_TRACE_FUNCTION(); + if (hasSteps) + return false; // TODO not implemented yet: https://github.com/opencv/opencv/pull/19546 + std::vector inputs; std::vector outputs; @@ -485,9 +515,24 @@ public: const Mat& inpMat = inputs[0]; CV_Assert(outputs.size() == finalSliceRanges.size()); - for (size_t i = 0; i < outputs.size(); i++) + + if (!hasSteps) { - inpMat(finalSliceRanges[i]).copyTo(outputs[i]); + for (size_t i = 0; i < outputs.size(); i++) + { + inpMat(finalSliceRanges[i]).copyTo(outputs[i]); + } + } + else + { + int dimsNum = inpMat.dims; + + for (size_t i = 0; i < outputs.size(); i++) + { + std::vector inpIdx(dimsNum, 0); + std::vector outIdx(dimsNum, 0); + getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); + } } } @@ -603,11 +648,42 @@ public: #endif +private: + void getSliceRecursive(const Mat &inpMat, std::vector &inpIdx, + const std::vector &sliceRanges, + const std::vector &sliceSteps, int dim, int dimsNum, + Mat &outputs, std::vector &outIdx) + { + int begin = sliceRanges[dim].start; + int end = sliceRanges[dim].end; + int step = !sliceSteps.empty() ? 
sliceSteps[dim] : 1; + + const bool is32F = inpMat.depth() == CV_32F; + + // TODO optimization is required (for 2D tail case at least) + for (int k = begin, j = 0; k < end; k += step, j++) + { + inpIdx[dim] = k; + outIdx[dim] = j; + + if (dim + 1 < dimsNum) + getSliceRecursive(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx); + else + { + if (is32F) + outputs.at(outIdx.data()) = inpMat.at(inpIdx.data()); + else + outputs.at(outIdx.data()) = inpMat.at(inpIdx.data()); // 16F emulation + } + } + } + protected: // The actual non-negative values determined from @p sliceRanges depends on input size. std::vector > finalSliceRanges; bool hasDynamicShapes; bool shapesInitialized; + bool hasSteps; }; class CropLayerImpl CV_FINAL : public SliceLayerImpl @@ -634,7 +710,7 @@ public: CV_Assert(inputs.size() == 2); MatShape dstShape = inputs[0]; - int start = clamp(axis, dstShape); + int start = normalize_axis(axis, dstShape); for (int i = start; i < dstShape.size(); i++) { dstShape[i] = inputs[1][i]; @@ -653,7 +729,7 @@ public: const Mat &inpSzBlob = inputs[1]; int dims = inpBlob.dims; - int start_axis = clamp(axis, dims); + int start_axis = normalize_axis(axis, dims); std::vector offset_final(dims, 0); if (offset.size() == 1) diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 6715c86e39..546c1017ad 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -89,7 +89,7 @@ public: { bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); MatShape shape = inputs[0]; - int cAxis = clamp(axisRaw, shape.size()); + int cAxis = normalize_axis(axisRaw, shape.size()); shape[cAxis] = 1; internals.assign(1, shape); return inplace; @@ -124,7 +124,7 @@ public: UMat& src = inputs[0]; UMat& dstMat = outputs[0]; - int axis = clamp(axisRaw, src.dims); + int axis = normalize_axis(axisRaw, src.dims); if (softmaxOp.empty()) { @@ -216,7 +216,7 @@ public: const Mat &src = inputs[0]; Mat &dst = outputs[0]; - int axis = clamp(axisRaw, src.dims); + int axis = normalize_axis(axisRaw, src.dims); size_t outerSize = src.total(0, axis), channels = src.size[axis], innerSize = src.total(axis + 1); @@ -306,7 +306,7 @@ public: auto context = reinterpret_cast(context_); auto input_wrapper = inputs[0].dynamicCast(); - auto channel_axis = clamp(axisRaw, input_wrapper->getRank()); + auto channel_axis = normalize_axis(axisRaw, input_wrapper->getRank()); return make_cuda_node(preferableTarget, std::move(context->cudnn_handle), channel_axis, logSoftMax); } #endif @@ -315,7 +315,7 @@ public: { #ifdef HAVE_VULKAN vkcom::Tensor in = VkComTensor(inputs[0]); - int cAxis = clamp(axisRaw, in.dimNum()); + int cAxis = normalize_axis(axisRaw, in.dimNum()); std::shared_ptr op(new vkcom::OpSoftmax(cAxis, logSoftMax)); return Ptr(new VkComBackendNode(inputs, op)); #endif // HAVE_VULKAN @@ -354,7 +354,7 @@ public: InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::Builder::SoftMaxLayer ieLayer(name); - ieLayer.setAxis(clamp(axisRaw, input->getDims().size())); + ieLayer.setAxis(normalize_axis(axisRaw, input->getDims().size())); return Ptr(new InfEngineBackendNode(ieLayer)); } @@ -365,7 +365,7 @@ public: const std::vector >& nodes) CV_OVERRIDE { auto& ieInpNode = nodes[0].dynamicCast()->node; - int axis = clamp(axisRaw, ieInpNode->get_shape().size()); + int axis = normalize_axis(axisRaw, ieInpNode->get_shape().size()); auto softmax = std::make_shared(ieInpNode, axis); if 
(logSoftMax) return Ptr(new InfEngineNgraphNode(std::make_shared(softmax))); diff --git a/modules/dnn/src/model.cpp b/modules/dnn/src/model.cpp index aefeaa42b3..0af8223a7f 100644 --- a/modules/dnn/src/model.cpp +++ b/modules/dnn/src/model.cpp @@ -4,7 +4,6 @@ #include "precomp.hpp" #include -#include #include #include @@ -37,9 +36,10 @@ public: virtual void setPreferableBackend(Backend backendId) { net.setPreferableBackend(backendId); } virtual void setPreferableTarget(Target targetId) { net.setPreferableTarget(targetId); } - /*virtual*/ + virtual void initNet(const Net& network) { + CV_TRACE_FUNCTION(); net = network; outNames = net.getUnconnectedOutLayersNames(); @@ -91,6 +91,7 @@ public: /*virtual*/ void processFrame(InputArray frame, OutputArrayOfArrays outs) { + CV_TRACE_FUNCTION(); if (size.empty()) CV_Error(Error::StsBadSize, "Input size not specified"); @@ -103,6 +104,7 @@ public: Mat imInfo(Matx13f(size.height, size.width, 1.6f)); net.setInput(imInfo, "im_info"); } + net.forward(outs, outNames); } }; @@ -320,34 +322,78 @@ void SegmentationModel::segment(InputArray frame, OutputArray mask) } } -void disableRegionNMS(Net& net) +class DetectionModel_Impl : public Model::Impl { - for (String& name : net.getUnconnectedOutLayersNames()) +public: + virtual ~DetectionModel_Impl() {} + DetectionModel_Impl() : Impl() {} + DetectionModel_Impl(const DetectionModel_Impl&) = delete; + DetectionModel_Impl(DetectionModel_Impl&&) = delete; + + void disableRegionNMS(Net& net) { - int layerId = net.getLayerId(name); - Ptr layer = net.getLayer(layerId).dynamicCast(); - if (!layer.empty()) + for (String& name : net.getUnconnectedOutLayersNames()) { - layer->nmsThreshold = 0; + int layerId = net.getLayerId(name); + Ptr layer = net.getLayer(layerId).dynamicCast(); + if (!layer.empty()) + { + layer->nmsThreshold = 0; + } } } -} + + void setNmsAcrossClasses(bool value) { + nmsAcrossClasses = value; + } + + bool getNmsAcrossClasses() { + return nmsAcrossClasses; + } + +private: + bool nmsAcrossClasses = false; +}; DetectionModel::DetectionModel(const String& model, const String& config) - : Model(model, config) + : DetectionModel(readNet(model, config)) { - disableRegionNMS(getNetwork_()); // FIXIT Move to DetectionModel::Impl::initNet() + // nothing } -DetectionModel::DetectionModel(const Net& network) : Model(network) +DetectionModel::DetectionModel(const Net& network) : Model() { - disableRegionNMS(getNetwork_()); // FIXIT Move to DetectionModel::Impl::initNet() + impl = makePtr(); + impl->initNet(network); + impl.dynamicCast()->disableRegionNMS(getNetwork_()); // FIXIT Move to DetectionModel::Impl::initNet() +} + +DetectionModel::DetectionModel() : Model() +{ + // nothing +} + +DetectionModel& DetectionModel::setNmsAcrossClasses(bool value) +{ + CV_Assert(impl != nullptr && impl.dynamicCast() != nullptr); // remove once default constructor is removed + + impl.dynamicCast()->setNmsAcrossClasses(value); + return *this; +} + +bool DetectionModel::getNmsAcrossClasses() +{ + CV_Assert(impl != nullptr && impl.dynamicCast() != nullptr); // remove once default constructor is removed + + return impl.dynamicCast()->getNmsAcrossClasses(); } void DetectionModel::detect(InputArray frame, CV_OUT std::vector& classIds, CV_OUT std::vector& confidences, CV_OUT std::vector& boxes, float confThreshold, float nmsThreshold) { + CV_Assert(impl != nullptr && impl.dynamicCast() != nullptr); // remove once default constructor is removed + std::vector detections; impl->processFrame(frame, detections); @@ -413,7 +459,7 @@ void 
DetectionModel::detect(InputArray frame, CV_OUT std::vector& classIds, { std::vector predClassIds; std::vector predBoxes; - std::vector predConf; + std::vector predConfidences; for (int i = 0; i < detections.size(); ++i) { // Network produces output blob with a shape NxC where N is a number of @@ -442,37 +488,51 @@ void DetectionModel::detect(InputArray frame, CV_OUT std::vector& classIds, height = std::max(1, std::min(height, frameHeight - top)); predClassIds.push_back(classIdPoint.x); - predConf.push_back(static_cast(conf)); + predConfidences.push_back(static_cast(conf)); predBoxes.emplace_back(left, top, width, height); } } if (nmsThreshold) { - std::map > class2indices; - for (size_t i = 0; i < predClassIds.size(); i++) + if (getNmsAcrossClasses()) { - if (predConf[i] >= confThreshold) - { - class2indices[predClassIds[i]].push_back(i); - } - } - for (const auto& it : class2indices) - { - std::vector localBoxes; - std::vector localConfidences; - for (size_t idx : it.second) - { - localBoxes.push_back(predBoxes[idx]); - localConfidences.push_back(predConf[idx]); - } std::vector indices; - NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, indices); - classIds.resize(classIds.size() + indices.size(), it.first); + NMSBoxes(predBoxes, predConfidences, confThreshold, nmsThreshold, indices); for (int idx : indices) { - boxes.push_back(localBoxes[idx]); - confidences.push_back(localConfidences[idx]); + boxes.push_back(predBoxes[idx]); + confidences.push_back(predConfidences[idx]); + classIds.push_back(predClassIds[idx]); + } + } + else + { + std::map > class2indices; + for (size_t i = 0; i < predClassIds.size(); i++) + { + if (predConfidences[i] >= confThreshold) + { + class2indices[predClassIds[i]].push_back(i); + } + } + for (const auto& it : class2indices) + { + std::vector localBoxes; + std::vector localConfidences; + for (size_t idx : it.second) + { + localBoxes.push_back(predBoxes[idx]); + localConfidences.push_back(predConfidences[idx]); + } + std::vector indices; + NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, indices); + classIds.resize(classIds.size() + indices.size(), it.first); + for (int idx : indices) + { + boxes.push_back(localBoxes[idx]); + confidences.push_back(localConfidences[idx]); + } } } } @@ -480,11 +540,786 @@ void DetectionModel::detect(InputArray frame, CV_OUT std::vector& classIds, { boxes = std::move(predBoxes); classIds = std::move(predClassIds); - confidences = std::move(predConf); + confidences = std::move(predConfidences); } } else CV_Error(Error::StsNotImplemented, "Unknown output layer type: \"" + lastLayer->type + "\""); } +struct TextRecognitionModel_Impl : public Model::Impl +{ + std::string decodeType; + std::vector vocabulary; + + TextRecognitionModel_Impl() + { + CV_TRACE_FUNCTION(); + } + + TextRecognitionModel_Impl(const Net& network) + { + CV_TRACE_FUNCTION(); + initNet(network); + } + + inline + void setVocabulary(const std::vector& inputVoc) + { + vocabulary = inputVoc; + } + + inline + void setDecodeType(const std::string& type) + { + decodeType = type; + } + + virtual + std::string decode(const Mat& prediction) + { + CV_TRACE_FUNCTION(); + CV_Assert(!prediction.empty()); + if (decodeType.empty()) + CV_Error(Error::StsBadArg, "TextRecognitionModel: decodeType is not specified"); + if (vocabulary.empty()) + CV_Error(Error::StsBadArg, "TextRecognitionModel: vocabulary is not specified"); + + std::string decodeSeq; + if (decodeType == "CTC-greedy") + { + CV_CheckEQ(prediction.dims, 3, ""); + 
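
The loop below implements CTC-greedy decoding: take the argmax class per time step, drop blanks (class index 0), and collapse consecutive repeats unless a blank separates them. A self-contained sketch of the same rule over a simplified [T][1 + vocab] score array (containers and names here are illustrative, not the Model internals):

#include <string>
#include <vector>

static std::string ctcGreedyDecode(const std::vector<std::vector<float> >& scores, // [T][1 + vocab]
                                   const std::vector<std::string>& vocabulary)
{
    std::string decoded;
    int lastClass = 0;      // 0 is the CTC blank
    bool afterBlank = true; // a blank allows re-emitting the previous class
    for (size_t t = 0; t < scores.size(); t++)
    {
        const std::vector<float>& row = scores[t];
        int best = 0;
        for (int j = 1; j < (int)row.size(); j++)
            if (row[j] > row[best]) best = j;
        if (best > 0)
        {
            if (best != lastClass || afterBlank)
                decoded += vocabulary[best - 1];
            lastClass = best;
            afterBlank = false;
        }
        else
            afterBlank = true;
    }
    return decoded;
}
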
CV_CheckType(prediction.type(), CV_32FC1, ""); + const int vocLength = (int)(vocabulary.size()); + CV_CheckLE(prediction.size[1], vocLength, ""); + bool ctcFlag = true; + int lastLoc = 0; + for (int i = 0; i < prediction.size[0]; i++) + { + const float* pred = prediction.ptr(i); + int maxLoc = 0; + float maxScore = pred[0]; + for (int j = 1; j < vocLength + 1; j++) + { + float score = pred[j]; + if (maxScore < score) + { + maxScore = score; + maxLoc = j; + } + } + + if (maxLoc > 0) + { + std::string currentChar = vocabulary.at(maxLoc - 1); + if (maxLoc != lastLoc || ctcFlag) + { + lastLoc = maxLoc; + decodeSeq += currentChar; + ctcFlag = false; + } + } + else + { + ctcFlag = true; + } + } + } else if (decodeType.length() == 0) { + CV_Error(Error::StsBadArg, "Please set decodeType"); + } else { + CV_Error_(Error::StsBadArg, ("Unsupported decodeType: %s", decodeType.c_str())); + } + + return decodeSeq; + } + + virtual + std::string recognize(InputArray frame) + { + CV_TRACE_FUNCTION(); + std::vector outs; + processFrame(frame, outs); + CV_CheckEQ(outs.size(), (size_t)1, ""); + return decode(outs[0]); + } + + virtual + void recognize(InputArray frame, InputArrayOfArrays roiRects, CV_OUT std::vector& results) + { + CV_TRACE_FUNCTION(); + results.clear(); + if (roiRects.empty()) + { + auto s = recognize(frame); + results.push_back(s); + return; + } + + std::vector rects; + roiRects.copyTo(rects); + + // Predict for each RoI + Mat input = frame.getMat(); + for (size_t i = 0; i < rects.size(); i++) + { + Rect roiRect = rects[i]; + Mat roi = input(roiRect); + auto s = recognize(roi); + results.push_back(s); + } + } + + static inline + TextRecognitionModel_Impl& from(const std::shared_ptr& ptr) + { + CV_Assert(ptr); + return *((TextRecognitionModel_Impl*)ptr.get()); + } +}; + +TextRecognitionModel::TextRecognitionModel() +{ + impl = std::static_pointer_cast(makePtr()); +} + +TextRecognitionModel::TextRecognitionModel(const Net& network) +{ + impl = std::static_pointer_cast(std::make_shared(network)); +} + +TextRecognitionModel& TextRecognitionModel::setDecodeType(const std::string& decodeType) +{ + TextRecognitionModel_Impl::from(impl).setDecodeType(decodeType); + return *this; +} + +const std::string& TextRecognitionModel::getDecodeType() const +{ + return TextRecognitionModel_Impl::from(impl).decodeType; +} + +TextRecognitionModel& TextRecognitionModel::setVocabulary(const std::vector& inputVoc) +{ + TextRecognitionModel_Impl::from(impl).setVocabulary(inputVoc); + return *this; +} + +const std::vector& TextRecognitionModel::getVocabulary() const +{ + return TextRecognitionModel_Impl::from(impl).vocabulary; +} + +std::string TextRecognitionModel::recognize(InputArray frame) const +{ + return TextRecognitionModel_Impl::from(impl).recognize(frame); +} + +void TextRecognitionModel::recognize(InputArray frame, InputArrayOfArrays roiRects, CV_OUT std::vector& results) const +{ + TextRecognitionModel_Impl::from(impl).recognize(frame, roiRects, results); +} + + +///////////////////////////////////////// Text Detection ///////////////////////////////////////// + +struct TextDetectionModel_Impl : public Model::Impl +{ + TextDetectionModel_Impl() {} + + TextDetectionModel_Impl(const Net& network) + { + CV_TRACE_FUNCTION(); + initNet(network); + } + + virtual + std::vector< std::vector > detect(InputArray frame, CV_OUT std::vector& confidences) + { + CV_TRACE_FUNCTION(); + std::vector rects = detectTextRectangles(frame, confidences); + std::vector< std::vector > results; + for (const RotatedRect& rect : 
rects) + { + Point2f vertices[4] = {}; + rect.points(vertices); + std::vector result = { vertices[0], vertices[1], vertices[2], vertices[3] }; + results.emplace_back(result); + } + return results; + } + + virtual + std::vector< std::vector > detect(InputArray frame) + { + CV_TRACE_FUNCTION(); + std::vector confidences; + return detect(frame, confidences); + } + + virtual + std::vector detectTextRectangles(InputArray frame, CV_OUT std::vector& confidences) + { + CV_Error(Error::StsNotImplemented, ""); + } + + virtual + std::vector detectTextRectangles(InputArray frame) + { + CV_TRACE_FUNCTION(); + std::vector confidences; + return detectTextRectangles(frame, confidences); + } + + static inline + TextDetectionModel_Impl& from(const std::shared_ptr& ptr) + { + CV_Assert(ptr); + return *((TextDetectionModel_Impl*)ptr.get()); + } +}; + + +TextDetectionModel::TextDetectionModel() + : Model() +{ + // nothing +} + +static +void to32s( + const std::vector< std::vector >& detections_f, + CV_OUT std::vector< std::vector >& detections +) +{ + detections.resize(detections_f.size()); + for (size_t i = 0; i < detections_f.size(); i++) + { + const auto& contour_f = detections_f[i]; + std::vector contour(contour_f.size()); + for (size_t j = 0; j < contour_f.size(); j++) + { + contour[j].x = cvRound(contour_f[j].x); + contour[j].y = cvRound(contour_f[j].y); + } + swap(detections[i], contour); + } +} + +void TextDetectionModel::detect( + InputArray frame, + CV_OUT std::vector< std::vector >& detections, + CV_OUT std::vector& confidences +) const +{ + std::vector< std::vector > detections_f = TextDetectionModel_Impl::from(impl).detect(frame, confidences); + to32s(detections_f, detections); + return; +} + +void TextDetectionModel::detect( + InputArray frame, + CV_OUT std::vector< std::vector >& detections +) const +{ + std::vector< std::vector > detections_f = TextDetectionModel_Impl::from(impl).detect(frame); + to32s(detections_f, detections); + return; +} + +void TextDetectionModel::detectTextRectangles( + InputArray frame, + CV_OUT std::vector& detections, + CV_OUT std::vector& confidences +) const +{ + detections = TextDetectionModel_Impl::from(impl).detectTextRectangles(frame, confidences); + return; +} + +void TextDetectionModel::detectTextRectangles( + InputArray frame, + CV_OUT std::vector& detections +) const +{ + detections = TextDetectionModel_Impl::from(impl).detectTextRectangles(frame); + return; +} + + +struct TextDetectionModel_EAST_Impl : public TextDetectionModel_Impl +{ + float confThreshold; + float nmsThreshold; + + TextDetectionModel_EAST_Impl() + : confThreshold(0.5f) + , nmsThreshold(0.0f) + { + CV_TRACE_FUNCTION(); + } + + TextDetectionModel_EAST_Impl(const Net& network) + : TextDetectionModel_EAST_Impl() + { + CV_TRACE_FUNCTION(); + initNet(network); + } + + void setConfidenceThreshold(float confThreshold_) { confThreshold = confThreshold_; } + float getConfidenceThreshold() const { return confThreshold; } + + void setNMSThreshold(float nmsThreshold_) { nmsThreshold = nmsThreshold_; } + float getNMSThreshold() const { return nmsThreshold; } + + // TODO: According to article EAST supports quadrangles output: https://arxiv.org/pdf/1704.03155.pdf +#if 0 + virtual + std::vector< std::vector > detect(InputArray frame, CV_OUT std::vector& confidences) CV_OVERRIDE +#endif + + virtual + std::vector detectTextRectangles(InputArray frame, CV_OUT std::vector& confidences) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + std::vector results; + + std::vector outs; + processFrame(frame, outs); + 
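
The geometry decoding below turns each above-threshold cell into a RotatedRect: the five geometry channels hold the distances from the cell to the four box edges plus a rotation angle, and the score map has a stride of 4 pixels relative to the input. A minimal per-cell sketch of the same arithmetic (the helper is illustrative; the real loop operates on the raw blob pointers):

#include <cmath>
#include <opencv2/core.hpp>

// x, y: cell coordinates in the score map; d0..d3: distances to the
// top/right/bottom/left edges; angle: box rotation in radians.
static cv::RotatedRect decodeEastCell(int x, int y, float d0, float d1,
                                      float d2, float d3, float angle)
{
    const float cosA = std::cos(angle), sinA = std::sin(angle);
    const float h = d0 + d2, w = d1 + d3;
    const cv::Point2f offset(x * 4.0f + cosA * d1 + sinA * d2,
                             y * 4.0f - sinA * d1 + cosA * d2);
    const cv::Point2f p1 = cv::Point2f(-sinA * h, -cosA * h) + offset;
    const cv::Point2f p3 = cv::Point2f(-cosA * w,  sinA * w) + offset;
    return cv::RotatedRect(0.5f * (p1 + p3), cv::Size2f(w, h),
                           -angle * 180.0f / (float)CV_PI);
}
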
CV_CheckEQ(outs.size(), (size_t)2, ""); + Mat geometry = outs[0]; + Mat scoreMap = outs[1]; + + CV_CheckEQ(scoreMap.dims, 4, ""); + CV_CheckEQ(geometry.dims, 4, ""); + CV_CheckEQ(scoreMap.size[0], 1, ""); + CV_CheckEQ(geometry.size[0], 1, ""); + CV_CheckEQ(scoreMap.size[1], 1, ""); + CV_CheckEQ(geometry.size[1], 5, ""); + CV_CheckEQ(scoreMap.size[2], geometry.size[2], ""); + CV_CheckEQ(scoreMap.size[3], geometry.size[3], ""); + + CV_CheckType(scoreMap.type(), CV_32FC1, ""); + CV_CheckType(geometry.type(), CV_32FC1, ""); + + std::vector boxes; + std::vector scores; + const int height = scoreMap.size[2]; + const int width = scoreMap.size[3]; + for (int y = 0; y < height; ++y) + { + const float* scoresData = scoreMap.ptr(0, 0, y); + const float* x0_data = geometry.ptr(0, 0, y); + const float* x1_data = geometry.ptr(0, 1, y); + const float* x2_data = geometry.ptr(0, 2, y); + const float* x3_data = geometry.ptr(0, 3, y); + const float* anglesData = geometry.ptr(0, 4, y); + for (int x = 0; x < width; ++x) + { + float score = scoresData[x]; + if (score < confThreshold) + continue; + + float offsetX = x * 4.0f, offsetY = y * 4.0f; + float angle = anglesData[x]; + float cosA = std::cos(angle); + float sinA = std::sin(angle); + float h = x0_data[x] + x2_data[x]; + float w = x1_data[x] + x3_data[x]; + + Point2f offset(offsetX + cosA * x1_data[x] + sinA * x2_data[x], + offsetY - sinA * x1_data[x] + cosA * x2_data[x]); + Point2f p1 = Point2f(-sinA * h, -cosA * h) + offset; + Point2f p3 = Point2f(-cosA * w, sinA * w) + offset; + boxes.push_back(RotatedRect(0.5f * (p1 + p3), Size2f(w, h), -angle * 180.0f / (float)CV_PI)); + scores.push_back(score); + } + } + + // Apply non-maximum suppression procedure. + std::vector indices; + NMSBoxes(boxes, scores, confThreshold, nmsThreshold, indices); + + confidences.clear(); + confidences.reserve(indices.size()); + + // Re-scale + Point2f ratio((float)frame.cols() / size.width, (float)frame.rows() / size.height); + bool isUniformRatio = std::fabs(ratio.x - ratio.y) <= 0.01f; + for (uint i = 0; i < indices.size(); i++) + { + auto idx = indices[i]; + + auto conf = scores[idx]; + confidences.push_back(conf); + + RotatedRect& box0 = boxes[idx]; + + if (isUniformRatio) + { + RotatedRect box = box0; + box.center.x *= ratio.x; + box.center.y *= ratio.y; + box.size.width *= ratio.x; + box.size.height *= ratio.y; + results.emplace_back(box); + } + else + { + Point2f vertices[4] = {}; + box0.points(vertices); + for (int j = 0; j < 4; j++) + { + vertices[j].x *= ratio.x; + vertices[j].y *= ratio.y; + } + RotatedRect box = minAreaRect(Mat(4, 1, CV_32FC2, (void*)vertices)); + + // minArea() rect is not normalized, it may return rectangles rotated by +90/-90 + float angle_diff = std::fabs(box.angle - box0.angle); + while (angle_diff >= (90 + 45)) + { + box.angle += (box.angle < box0.angle) ? 
180 : -180; + angle_diff = std::fabs(box.angle - box0.angle); + } + if (angle_diff > 45) // avoid ~90 degree turns + { + std::swap(box.size.width, box.size.height); + if (box.angle < box0.angle) + box.angle += 90; + else if (box.angle > box0.angle) + box.angle -= 90; + } + // CV_DbgAssert(std::fabs(box.angle - box0.angle) <= 45); + + results.emplace_back(box); + } + } + + return results; + } + + static inline + TextDetectionModel_EAST_Impl& from(const std::shared_ptr& ptr) + { + CV_Assert(ptr); + return *((TextDetectionModel_EAST_Impl*)ptr.get()); + } +}; + + +TextDetectionModel_EAST::TextDetectionModel_EAST() + : TextDetectionModel() +{ + impl = std::static_pointer_cast(makePtr()); +} + +TextDetectionModel_EAST::TextDetectionModel_EAST(const Net& network) + : TextDetectionModel() +{ + impl = std::static_pointer_cast(makePtr(network)); +} + +TextDetectionModel_EAST& TextDetectionModel_EAST::setConfidenceThreshold(float confThreshold) +{ + TextDetectionModel_EAST_Impl::from(impl).setConfidenceThreshold(confThreshold); + return *this; +} +float TextDetectionModel_EAST::getConfidenceThreshold() const +{ + return TextDetectionModel_EAST_Impl::from(impl).getConfidenceThreshold(); +} + +TextDetectionModel_EAST& TextDetectionModel_EAST::setNMSThreshold(float nmsThreshold) +{ + TextDetectionModel_EAST_Impl::from(impl).setNMSThreshold(nmsThreshold); + return *this; +} +float TextDetectionModel_EAST::getNMSThreshold() const +{ + return TextDetectionModel_EAST_Impl::from(impl).getNMSThreshold(); +} + + + +struct TextDetectionModel_DB_Impl : public TextDetectionModel_Impl +{ + float binaryThreshold; + float polygonThreshold; + double unclipRatio; + int maxCandidates; + + TextDetectionModel_DB_Impl() + : binaryThreshold(0.3f) + , polygonThreshold(0.5f) + , unclipRatio(2.0f) + , maxCandidates(0) + { + CV_TRACE_FUNCTION(); + } + + TextDetectionModel_DB_Impl(const Net& network) + : TextDetectionModel_DB_Impl() + { + CV_TRACE_FUNCTION(); + initNet(network); + } + + void setBinaryThreshold(float binaryThreshold_) { binaryThreshold = binaryThreshold_; } + float getBinaryThreshold() const { return binaryThreshold; } + + void setPolygonThreshold(float polygonThreshold_) { polygonThreshold = polygonThreshold_; } + float getPolygonThreshold() const { return polygonThreshold; } + + void setUnclipRatio(double unclipRatio_) { unclipRatio = unclipRatio_; } + double getUnclipRatio() const { return unclipRatio; } + + void setMaxCandidates(int maxCandidates_) { maxCandidates = maxCandidates_; } + int getMaxCandidates() const { return maxCandidates; } + + + virtual + std::vector detectTextRectangles(InputArray frame, CV_OUT std::vector& confidences) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + std::vector< std::vector > contours = detect(frame, confidences); + std::vector results; results.reserve(contours.size()); + for (size_t i = 0; i < contours.size(); i++) + { + auto& contour = contours[i]; + RotatedRect box = minAreaRect(contour); + + // minArea() rect is not normalized, it may return rectangles with angle=-90 or height < width + const float angle_threshold = 60; // do not expect vertical text, TODO detection algo property + bool swap_size = false; + if (box.size.width < box.size.height) // horizontal-wide text area is expected + swap_size = true; + else if (std::fabs(box.angle) >= angle_threshold) // don't work with vertical rectangles + swap_size = true; + if (swap_size) + { + std::swap(box.size.width, box.size.height); + if (box.angle < 0) + box.angle += 90; + else if (box.angle > 0) + box.angle -= 90; + } + + 
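
The width/height swap above compensates for cv::minAreaRect(), whose result is not normalized and may describe a horizontal text line as a near-vertical rectangle. The same correction reappears in detect() below; factored out, it would look like this (the helper name is hypothetical):

#include <algorithm>
#include <cmath>
#include <opencv2/core.hpp>

// Fold a minAreaRect() result toward a horizontal-wide, small-angle box.
static void normalizeHorizontalBox(cv::RotatedRect& box, float angleThreshold = 60.f)
{
    bool swapSize = false;
    if (box.size.width < box.size.height)            // expect width >= height
        swapSize = true;
    else if (std::fabs(box.angle) >= angleThreshold) // avoid vertical boxes
        swapSize = true;
    if (swapSize)
    {
        std::swap(box.size.width, box.size.height);
        if (box.angle < 0)
            box.angle += 90;
        else if (box.angle > 0)
            box.angle -= 90;
    }
}
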
+            results.push_back(box);
+        }
+        return results;
+    }
+
+    std::vector< std::vector<Point> > detect(InputArray frame, CV_OUT std::vector<float>& confidences) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        std::vector< std::vector<Point> > results;
+
+        std::vector<Mat> outs;
+        processFrame(frame, outs);
+        CV_Assert(outs.size() == 1);
+        Mat binary = outs[0];
+
+        // Threshold
+        Mat bitmap;
+        threshold(binary, bitmap, binaryThreshold, 255, THRESH_BINARY);
+
+        // Scale ratio
+        float scaleHeight = (float)(frame.rows()) / (float)(binary.size[0]);
+        float scaleWidth = (float)(frame.cols()) / (float)(binary.size[1]);
+
+        // Find contours
+        std::vector< std::vector<Point> > contours;
+        bitmap.convertTo(bitmap, CV_8UC1);
+        findContours(bitmap, contours, RETR_LIST, CHAIN_APPROX_SIMPLE);
+
+        // Candidate number limitation
+        size_t numCandidate = std::min(contours.size(), (size_t)(maxCandidates > 0 ? maxCandidates : INT_MAX));
+
+        for (size_t i = 0; i < numCandidate; i++)
+        {
+            std::vector<Point>& contour = contours[i];
+
+            // Calculate text contour score
+            if (contourScore(binary, contour) < polygonThreshold)
+                continue;
+
+            // Rescale
+            std::vector<Point> contourScaled; contourScaled.reserve(contour.size());
+            for (size_t j = 0; j < contour.size(); j++)
+            {
+                contourScaled.push_back(Point(int(contour[j].x * scaleWidth),
+                                              int(contour[j].y * scaleHeight)));
+            }
+
+            // Unclip
+            RotatedRect box = minAreaRect(contourScaled);
+
+            // minAreaRect() result is not normalized, it may return rectangles with angle=-90 or height < width
+            const float angle_threshold = 60;  // do not expect vertical text, TODO detection algo property
+            bool swap_size = false;
+            if (box.size.width < box.size.height)  // horizontal-wide text area is expected
+                swap_size = true;
+            else if (std::fabs(box.angle) >= angle_threshold)  // don't work with vertical rectangles
+                swap_size = true;
+            if (swap_size)
+            {
+                std::swap(box.size.width, box.size.height);
+                if (box.angle < 0)
+                    box.angle += 90;
+                else if (box.angle > 0)
+                    box.angle -= 90;
+            }
+
+            Point2f vertex[4];
+            box.points(vertex);  // order: bl, tl, tr, br
+            std::vector<Point> approx;
+            for (int j = 0; j < 4; j++)
+                approx.emplace_back(vertex[j]);
+            std::vector<Point> polygon;
+            unclip(approx, polygon, unclipRatio);
+            results.push_back(polygon);
+        }
+
+        confidences = std::vector<float>(contours.size(), 1.0f);
+        return results;
+    }
+
+    // According to https://github.com/MhLiao/DB/blob/master/structure/representers/seg_detector_representer.py (2020-10)
+    static double contourScore(const Mat& binary, const std::vector<Point>& contour)
+    {
+        Rect rect = boundingRect(contour);
+        int xmin = std::max(rect.x, 0);
+        int xmax = std::min(rect.x + rect.width, binary.cols - 1);
+        int ymin = std::max(rect.y, 0);
+        int ymax = std::min(rect.y + rect.height, binary.rows - 1);
+
+        Mat binROI = binary(Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
+
+        Mat mask = Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8U);
+        std::vector<Point> roiContour;
+        for (size_t i = 0; i < contour.size(); i++) {
+            Point pt = Point(contour[i].x - xmin, contour[i].y - ymin);
+            roiContour.push_back(pt);
+        }
+        std::vector<std::vector<Point>> roiContours = {roiContour};
+        fillPoly(mask, roiContours, Scalar(1));
+        double score = cv::mean(binROI, mask).val[0];
+
+        return score;
+    }
+
+    // According to https://github.com/MhLiao/DB/blob/master/structure/representers/seg_detector_representer.py (2020-10)
+    static void unclip(const std::vector<Point>& inPoly, std::vector<Point> &outPoly, const double unclipRatio)
+    {
+        double area = contourArea(inPoly);
+        double length = arcLength(inPoly, true);
+        CV_Assert(length > FLT_EPSILON);
+        double distance = area * unclipRatio / length;
+
+        size_t numPoints = inPoly.size();
+        std::vector<std::vector<Point2f>> newLines;
+        for (size_t i = 0; i < numPoints; i++) {
+            std::vector<Point2f> newLine;
+            Point pt1 = inPoly[i];
+            Point pt2 = inPoly[(i - 1) % numPoints];
+            Point vec = pt1 - pt2;
+            float unclipDis = (float)(distance / norm(vec));
+            Point2f rotateVec = Point2f(vec.y * unclipDis, -vec.x * unclipDis);
+            newLine.push_back(Point2f(pt1.x + rotateVec.x, pt1.y + rotateVec.y));
+            newLine.push_back(Point2f(pt2.x + rotateVec.x, pt2.y + rotateVec.y));
+            newLines.push_back(newLine);
+        }
+
+        size_t numLines = newLines.size();
+        for (size_t i = 0; i < numLines; i++) {
+            Point2f a = newLines[i][0];
+            Point2f b = newLines[i][1];
+            Point2f c = newLines[(i + 1) % numLines][0];
+            Point2f d = newLines[(i + 1) % numLines][1];
+            Point2f pt;
+            Point2f v1 = b - a;
+            Point2f v2 = d - c;
+            double cosAngle = (v1.x * v2.x + v1.y * v2.y) / (norm(v1) * norm(v2));
+
+            if (fabs(cosAngle) > 0.7) {
+                pt.x = (b.x + c.x) * 0.5;
+                pt.y = (b.y + c.y) * 0.5;
+            } else {
+                double denom = a.x * (double)(d.y - c.y) + b.x * (double)(c.y - d.y) +
+                               d.x * (double)(b.y - a.y) + c.x * (double)(a.y - b.y);
+                double num = a.x * (double)(d.y - c.y) + c.x * (double)(a.y - d.y) + d.x * (double)(c.y - a.y);
+                double s = num / denom;
+
+                pt.x = a.x + s*(b.x - a.x);
+                pt.y = a.y + s*(b.y - a.y);
+            }
+
+
+            outPoly.push_back(pt);
+        }
+    }
+
+
+    static inline
+    TextDetectionModel_DB_Impl& from(const std::shared_ptr<TextDetectionModel_Impl>& ptr)
+    {
+        CV_Assert(ptr);
+        return *((TextDetectionModel_DB_Impl*)ptr.get());
+    }
+};
+
+
+TextDetectionModel_DB::TextDetectionModel_DB()
+    : TextDetectionModel()
+{
+    impl = std::static_pointer_cast<TextDetectionModel_Impl>(makePtr<TextDetectionModel_DB_Impl>());
+}
+
+TextDetectionModel_DB::TextDetectionModel_DB(const Net& network)
+    : TextDetectionModel()
+{
+    impl = std::static_pointer_cast<TextDetectionModel_Impl>(makePtr<TextDetectionModel_DB_Impl>(network));
+}
+
+TextDetectionModel_DB& TextDetectionModel_DB::setBinaryThreshold(float binaryThreshold)
+{
+    TextDetectionModel_DB_Impl::from(impl).setBinaryThreshold(binaryThreshold);
+    return *this;
+}
+float TextDetectionModel_DB::getBinaryThreshold() const
+{
+    return TextDetectionModel_DB_Impl::from(impl).getBinaryThreshold();
+}
+
+TextDetectionModel_DB& TextDetectionModel_DB::setPolygonThreshold(float polygonThreshold)
+{
+    TextDetectionModel_DB_Impl::from(impl).setPolygonThreshold(polygonThreshold);
+    return *this;
+}
+float TextDetectionModel_DB::getPolygonThreshold() const
+{
+    return TextDetectionModel_DB_Impl::from(impl).getPolygonThreshold();
+}
+
+TextDetectionModel_DB& TextDetectionModel_DB::setUnclipRatio(double unclipRatio)
+{
+    TextDetectionModel_DB_Impl::from(impl).setUnclipRatio(unclipRatio);
+    return *this;
+}
+double TextDetectionModel_DB::getUnclipRatio() const
+{
+    return TextDetectionModel_DB_Impl::from(impl).getUnclipRatio();
+}
+
+TextDetectionModel_DB& TextDetectionModel_DB::setMaxCandidates(int maxCandidates)
+{
+    TextDetectionModel_DB_Impl::from(impl).setMaxCandidates(maxCandidates);
+    return *this;
+}
+int TextDetectionModel_DB::getMaxCandidates() const
+{
+    return TextDetectionModel_DB_Impl::from(impl).getMaxCandidates();
+}
+
+
 }}  // namespace
diff --git a/modules/dnn/src/nms.inl.hpp b/modules/dnn/src/nms.inl.hpp
index 89e3adfcf5..7b84839c02 100644
--- a/modules/dnn/src/nms.inl.hpp
+++ b/modules/dnn/src/nms.inl.hpp
@@ -62,12 +62,15 @@ inline void GetMaxScoreIndex(const std::vector<float>& scores, const float thres
 // score_threshold: a threshold used to filter detection results.
 // nms_threshold: a threshold used in non maximum suppression.
 // top_k: if not > 0, keep at most top_k picked indices.
+// limit: early terminate once the # of picked indices has reached it.
 // indices: the kept indices of bboxes after nms.
 template <typename BoxType>
 inline void NMSFast_(const std::vector<BoxType>& bboxes,
       const std::vector<float>& scores, const float score_threshold,
       const float nms_threshold, const float eta, const int top_k,
-      std::vector<int>& indices, float (*computeOverlap)(const BoxType&, const BoxType&))
+      std::vector<int>& indices,
+      float (*computeOverlap)(const BoxType&, const BoxType&),
+      int limit = std::numeric_limits<int>::max())
 {
   CV_Assert(bboxes.size() == scores.size());
@@ -86,8 +89,12 @@ inline void NMSFast_(const std::vector<BoxType>& bboxes,
       float overlap = computeOverlap(bboxes[idx], bboxes[kept_idx]);
       keep = overlap <= adaptive_threshold;
     }
-    if (keep)
+    if (keep) {
       indices.push_back(idx);
+      if (indices.size() >= limit) {
+        break;
+      }
+    }
     if (keep && eta < 1 && adaptive_threshold > 0.5) {
       adaptive_threshold *= eta;
     }
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
index 8de7ba26e2..7bb277d102 100644
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -274,8 +274,6 @@ class OCL4DNNConvSpatial
     int32_t group_;
     bool bias_term_;
     UMat swizzled_weights_umat;
-    UMat weights_half;
-    UMat bias_half;
     UMat bottom_data2_;
     int32_t bottom_index_;
diff --git a/modules/dnn/src/ocl4dnn/src/math_functions.cpp b/modules/dnn/src/ocl4dnn/src/math_functions.cpp
index 47224c3be6..855a21e08f 100644
--- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp
+++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp
@@ -88,13 +88,13 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
         size_t global_copy[2];
         global_copy[0] = width;
         global_copy[1] = height;
-        oclk_gemm_copy.set(0, ocl::KernelArg::PtrReadOnly(buffer));
-        oclk_gemm_copy.set(1, image);
-        oclk_gemm_copy.set(2, offset);
-        oclk_gemm_copy.set(3, width);
-        oclk_gemm_copy.set(4, height);
-        oclk_gemm_copy.set(5, ld);
-        oclk_gemm_copy.run(2, global_copy, NULL, false);
+        oclk_gemm_copy
+            .args(
+                ocl::KernelArg::PtrReadOnly(buffer),
+                image, offset,
+                width, height,
+                ld)
+            .run(2, global_copy, NULL, false);
     }
   } else {
     if (!padding)
@@ -112,14 +112,14 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
       global_copy[0] = padded_width;
       global_copy[1] = padded_height;
-      oclk_gemm_copy.set(0, ocl::KernelArg::PtrReadOnly(buffer));
-      oclk_gemm_copy.set(1, image);
-      oclk_gemm_copy.set(2, offset);
-      oclk_gemm_copy.set(3, width);
-      oclk_gemm_copy.set(4, height);
-      oclk_gemm_copy.set(5, ld);
-
-      oclk_gemm_copy.run(2, global_copy, NULL, false);
+      bool res = oclk_gemm_copy
+          .args(
+              ocl::KernelArg::PtrReadOnly(buffer),
+              image, offset,
+              width, height,
+              ld)
+          .run(2, global_copy, NULL, false);
+      CV_Assert(res);
     }
   }
@@ -465,8 +465,12 @@ static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA,
         kernel_name += "_float";
     }
+    bool isBetaZero = beta == 0;
+
     String opts = format("-DTYPE=%d", halfPrecisionMode ?
TYPE_HALF : TYPE_FLOAT); - ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_buffer_oclsrc, opts); + if (isBetaZero) + opts += " -DZERO_BETA=1"; + size_t local[2] = {}; size_t global[2] = {}; if (TransA == CblasNoTrans && TransB != CblasNoTrans && is_small_batch) { @@ -496,27 +500,37 @@ static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA, local[1] = ly; } - int arg_idx = 0; - oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(A)); - oclk_gemm_float.set(arg_idx++, offA); - oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B)); - oclk_gemm_float.set(arg_idx++, offB); - oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrWriteOnly(C)); - oclk_gemm_float.set(arg_idx++, offC); - oclk_gemm_float.set(arg_idx++, M); - oclk_gemm_float.set(arg_idx++, N); - oclk_gemm_float.set(arg_idx++, K); - oclk_gemm_float.set(arg_idx++, (float)alpha); - oclk_gemm_float.set(arg_idx++, (float)beta); - bool ret = true; - if (TransB == CblasNoTrans || TransA != CblasNoTrans) { + if (TransB == CblasNoTrans || TransA != CblasNoTrans) + { + // _NN_ int stride = 256; for (int start_index = 0; start_index < K; start_index += stride) { - oclk_gemm_float.set(arg_idx, start_index); - ret = oclk_gemm_float.run(2, global, local, false); + ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_buffer_oclsrc, opts); + oclk_gemm_float.args( + ocl::KernelArg::PtrReadOnly(A), offA, + ocl::KernelArg::PtrReadOnly(B), offB, + isBetaZero ? ocl::KernelArg::PtrWriteOnly(C) : ocl::KernelArg::PtrReadWrite(C), offC, + M, N, K, + (float)alpha, (float)beta, + start_index + ); + ret &= oclk_gemm_float.run(2, global, local, false); } - } else { + } + else + { + // _NT_ + //C.reshape(1,1).setTo(0xfe00 /*FP16 NAN*/); // stable one-line reproducer for https://github.com/opencv/opencv/issues/18937 + //C.reshape(1,1).setTo(0); // non-optimal fixup (and not accurate) + ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_buffer_oclsrc, opts); + oclk_gemm_float.args( + ocl::KernelArg::PtrReadOnly(A), offA, + ocl::KernelArg::PtrReadOnly(B), offB, + isBetaZero ? ocl::KernelArg::PtrWriteOnly(C) : ocl::KernelArg::PtrReadWrite(C), offC, + M, N, K, + (float)alpha, (float)beta + ); ret = oclk_gemm_float.run(2, global, local, false); } return ret; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index bf56d3a8a1..059fc8f402 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial::Forward(const UMat& bottom, fused_eltwise_ = false; } - if (use_half_ && bias_half.empty() && !bias.empty()) - convertFp16(bias, bias_half); + if (use_half_ && !bias.empty()) + CV_CheckTypeEQ(bias.type(), CV_16SC1, ""); - if (use_half_ && weights_half.empty()) - convertFp16(weight, weights_half); + if (use_half_) + CV_CheckTypeEQ(weight.type(), CV_16SC1, ""); - prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages); + prepareKernel(bottom, top, weight, bias, numImages); if (bestKernelConfig.empty()) return false; - return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig); + return convolve(bottom, top, weight, bias, numImages, bestKernelConfig); } template @@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, kernel_h_ * (int)alignSize(kernel_w_, 2), (use_half_) ? 
CV_16SC1 : CV_32FC1); - UMat swizzled_weights_tmp; - if (use_half_) - swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F); - if (!interleave) { - cl_uint argIdx = 0; int32_t channels = channels_ / group_; - ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"), - cv::ocl::dnn::conv_spatial_helper_oclsrc); + ocl::Kernel oclk_copy_weight( + use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float", + cv::ocl::dnn::conv_spatial_helper_oclsrc, + use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float" + ); if (oclk_copy_weight.empty()) return false; - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); - if (use_half_) - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp)); - else - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); - oclk_copy_weight.set(argIdx++, kernel_w_); - oclk_copy_weight.set(argIdx++, kernel_h_); - oclk_copy_weight.set(argIdx++, channels); - oclk_copy_weight.set(argIdx++, num_output_); - oclk_copy_weight.set(argIdx++, swizzled_factor); + oclk_copy_weight.args( + ocl::KernelArg::PtrReadOnly(weight), + ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat), + kernel_w_, + kernel_h_, + channels, + num_output_, + swizzled_factor + ); size_t global_work_size_copy[3] = { (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 }; @@ -778,13 +775,24 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, } } else { // assumption: kernel dimension is 2 - Mat weightMat = weight.getMat(ACCESS_READ); - Dtype* cpu_weight = (Dtype *)weightMat.ptr(); + Mat weightMat; Mat swizzledWeightMat; + UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack if (use_half_) - swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE); + { + CV_CheckTypeEQ(weight.type(), CV_16SC1, ""); + convertFp16(weight, weight_tmp); + weightMat = weight_tmp.getMat(ACCESS_READ); + swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F); + } else + { + weightMat = weight.getMat(ACCESS_READ); swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); + } + + CV_CheckTypeEQ(weightMat.type(), CV_32FC1, ""); + Dtype* cpu_weight = (Dtype *)weightMat.ptr(); Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr(); int interleavedRows = (kernel_w_ / 2) * 2; @@ -792,26 +800,28 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, int blockWidth = swizzled_factor; // should equal to simd size. 
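            // Illustration: the remap below makes the output-channel index 'od' the
            // innermost one. E.g. for kernel_h_ = 1, kernel_w_ = 2, channels_ = 1 and
            // M_ = 2, the OIHW-ordered weights {w00, w01, w10, w11} become
            // {w00, w10, w01, w11}: each spatial tap then stores one weight per
            // output channel contiguously, so a SIMD group of width 'swizzled_factor'
            // can fill all of its lanes with a single contiguous read.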
int rowAlignment = 32; size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype); - Dtype * tmpSwizzledWeight = reinterpret_cast(malloc(interleaved_filter_size)); - CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight"; + cv::AutoBuffer tmpSwizzledWeight(interleaved_filter_size); for (int od = 0; od < M_; od++) for (int id = 0; id < channels_; id++) for (int r = 0; r < kernel_h_; r++) for (int c = 0; c < kernel_w_; c++) tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] = cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c]; + interleaveMatrix(cpu_swizzled_weight, - tmpSwizzledWeight, + tmpSwizzledWeight.data(), kernel_w_ * kernel_h_ * channels_, M_, interleavedRows, nonInterleavedRows, blockWidth, rowAlignment); - free(tmpSwizzledWeight); - } - if (use_half_) - convertFp16(swizzled_weights_tmp, swizzled_weights_umat); + // unmap OpenCL buffers + weightMat.release(); + + if (use_half_) + convertFp16(swizzledWeightMat, swizzled_weights_umat); + } return true; } @@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, cl_uint argIdx = 0; setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - if (use_half_) - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half)); - else - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); if (bias_term_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); @@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, image_offset); - if (use_half_) - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half)); - else - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); kernel.set(argIdx++, kernel_offset); if (bias_term_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); @@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial::prepareKernel(const UMat &bottom, UMat &top, UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1); - calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages); + calculateBenchmark(bottom, benchData, weight, bias, numImages); if (run_auto_tuning_ || force_auto_tuning_) { diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp index ad3d903d68..7826f2b0ca 100644 --- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp +++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp @@ -10,11 +10,14 @@ #include "../graph_simplifier.hpp" #include "onnx_graph_simplifier.hpp" +#include #include namespace cv { namespace dnn { CV__DNN_INLINE_NS_BEGIN +extern bool DNN_DIAGNOSTICS_RUN; + // This wrapper can behave differently for fake input nodes and real graph nodes. 
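// Such wrappers feed the generic subgraph matcher, which walks patterns declared
// via addNodeToMatch(). For example, the MishSubgraph pattern added below matches
//     y = x * tanh(softplus(x))           (Softplus -> Tanh -> Mul)
// and NormalizeSubgraph4 matches L2 normalization spelled as
//     y = x / sqrt(max(sum(x * x), eps))  (Mul -> ReduceSum -> Max -> Sqrt -> Reciprocal -> Mul),
// with NormalizeSubgraph5 covering the equivalent Clip/Div spelling; each chain is
// replaced by a single "Mish" or "Normalize" layer.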
class ONNXNodeWrapper : public ImportNodeWrapper { @@ -249,6 +252,40 @@ public: } }; +class NormalizeSubgraph4 : public NormalizeSubgraphBase +{ +public: + NormalizeSubgraph4() : NormalizeSubgraphBase(1) + { + int input = addNodeToMatch(""); + int mul = addNodeToMatch("Mul", input, input); + int sum = addNodeToMatch("ReduceSum", mul); + int eps = addNodeToMatch(""); + int max = addNodeToMatch("Max", sum, eps); + int sqrt = addNodeToMatch("Sqrt", max); + int reciprocal = addNodeToMatch("Reciprocal", sqrt); + addNodeToMatch("Mul", input, reciprocal); + setFusedNode("Normalize", input); + } +}; + +class NormalizeSubgraph5 : public NormalizeSubgraphBase +{ +public: + NormalizeSubgraph5() : NormalizeSubgraphBase(1) + { + int input = addNodeToMatch(""); + int mul = addNodeToMatch("Mul", input, input); + int sum = addNodeToMatch("ReduceSum", mul); + int clip = addNodeToMatch("Clip", sum); + int sqrt = addNodeToMatch("Sqrt", clip); + int one = addNodeToMatch("Constant"); + int div = addNodeToMatch("Div", one, sqrt); + addNodeToMatch("Mul", input, div); + setFusedNode("Normalize", input); + } +}; + class GatherCastSubgraph : public Subgraph { public: @@ -314,6 +351,19 @@ public: } }; +class MishSubgraph : public Subgraph +{ +public: + MishSubgraph() + { + int input = addNodeToMatch(""); + int softplus = addNodeToMatch("Softplus", input); + int tanh = addNodeToMatch("Tanh", softplus); + addNodeToMatch("Mul", input, tanh); + setFusedNode("Mish", input); + } +}; + class MulCastSubgraph : public Subgraph { public: @@ -512,6 +562,9 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net) subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); + subgraphs.push_back(makePtr()); + subgraphs.push_back(makePtr()); + subgraphs.push_back(makePtr()); simplifySubgraphs(Ptr(new ONNXGraphWrapper(net)), subgraphs); } @@ -589,8 +642,17 @@ Mat getMatFromTensor(opencv_onnx::TensorProto& tensor_proto) } } else - CV_Error(Error::StsUnsupportedFormat, "Unsupported data type: " + - opencv_onnx::TensorProto_DataType_Name(datatype)); + { + std::string errorMsg = "Unsupported data type: " + + opencv_onnx::TensorProto_DataType_Name(datatype); + + if (!DNN_DIAGNOSTICS_RUN) + { + CV_Error(Error::StsUnsupportedFormat, errorMsg); + } + CV_LOG_ERROR(NULL, errorMsg); + return blob; + } if (tensor_proto.dims_size() == 0) blob.dims = 1; // To force 1-dimensional cv::Mat for scalars. 
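    // Note on the diagnostics path above: when DNN_DIAGNOSTICS_RUN is set, an
    // unsupported data type no longer throws but yields an empty blob, and the
    // caller is expected to skip it, as getGraphTensors() does in the importer:
    //     if (DNN_DIAGNOSTICS_RUN && mat.empty())
    //         continue;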
return blob; diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 859b595b7f..98714bbd5c 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -8,6 +8,8 @@ #include "../precomp.hpp" #include +#include + #include #undef CV_LOG_STRIP_LEVEL #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1 @@ -37,6 +39,7 @@ namespace cv { namespace dnn { CV__DNN_INLINE_NS_BEGIN +extern bool DNN_DIAGNOSTICS_RUN; class ONNXImporter { @@ -58,11 +61,12 @@ class ONNXImporter void addConstant(const std::string& name, const Mat& blob); void addLayer(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto); + static const std::set& getSupportedTypes(); public: ONNXImporter(Net& net, const char *onnxFile) - : dstNet(net) + : dstNet(net), utilNet() { hasDynamicShapes = false; CV_Assert(onnxFile); @@ -83,7 +87,7 @@ public: } ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer) - : dstNet(net) + : dstNet(net), utilNet() { hasDynamicShapes = false; CV_LOG_DEBUG(NULL, "DNN/ONNX: processing in-memory ONNX model (" << sizeBuffer << " bytes)"); @@ -110,6 +114,7 @@ public: protected: Net& dstNet; + Net utilNet; opencv_onnx::GraphProto graph_proto; std::string framework_name; @@ -182,6 +187,10 @@ std::map ONNXImporter::getGraphTensors( tensor_proto = graph_proto.initializer(i); Mat mat = getMatFromTensor(tensor_proto); releaseONNXTensor(tensor_proto); + + if (DNN_DIAGNOSTICS_RUN && mat.empty()) + continue; + layers_weights.insert(std::make_pair(tensor_proto.name(), mat)); } return layers_weights; @@ -201,118 +210,132 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_prot opencv_onnx::AttributeProto attribute_proto = node_proto.attribute(i); std::string attribute_name = attribute_proto.name(); - if(attribute_name == "kernel_shape") + try { - CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3); - lp.set("kernel_size", parse(attribute_proto.ints())); - } - else if(attribute_name == "strides") - { - CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3); - lp.set("stride", parse(attribute_proto.ints())); - } - else if(attribute_name == "pads") - { - if (node_proto.op_type() == "Pad") + if(attribute_name == "kernel_shape") { - // Padding layer. - // Paddings are in order begin0, begin1, .. beginN, end0, end1, ..., endN. - // We need to shuffle it to begin0, end0, begin1, end1, ... - CV_Assert(attribute_proto.ints_size() % 2 == 0); - const int dims = attribute_proto.ints_size() / 2; - std::vector paddings; - paddings.reserve(attribute_proto.ints_size()); - for (int i = 0; i < dims; ++i) + CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3); + lp.set("kernel_size", parse(attribute_proto.ints())); + } + else if(attribute_name == "strides") + { + CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3); + lp.set("stride", parse(attribute_proto.ints())); + } + else if(attribute_name == "pads") + { + if (node_proto.op_type() == "Pad") { - paddings.push_back(attribute_proto.ints(i)); - paddings.push_back(attribute_proto.ints(dims + i)); + // Padding layer. + // Paddings are in order begin0, begin1, .. beginN, end0, end1, ..., endN. + // We need to shuffle it to begin0, end0, begin1, end1, ... 
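+                    // For example, a 2-D Pad attribute [top, left, bottom, right]
+                    // (all begins first, then all ends) is reordered to
+                    // [top, bottom, left, right], i.e. (begin, end) pairs per axis,
+                    // which is the order the OpenCV Padding layer expects.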
+ CV_Assert(attribute_proto.ints_size() % 2 == 0); + const int dims = attribute_proto.ints_size() / 2; + std::vector paddings; + paddings.reserve(attribute_proto.ints_size()); + for (int i = 0; i < dims; ++i) + { + paddings.push_back(attribute_proto.ints(i)); + paddings.push_back(attribute_proto.ints(dims + i)); + } + lp.set("paddings", DictValue::arrayInt(&paddings[0], paddings.size())); } - lp.set("paddings", DictValue::arrayInt(&paddings[0], paddings.size())); + else + { + // Convolution or pooling. + CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 4 || attribute_proto.ints_size() == 6); + lp.set("pad", parse(attribute_proto.ints())); + } + } + else if(attribute_name == "auto_pad") + { + if (attribute_proto.s() == "SAME_UPPER" || attribute_proto.s() == "SAME_LOWER") { + lp.set("pad_mode", "SAME"); + } + else if (attribute_proto.s() == "VALID") { + lp.set("pad_mode", "VALID"); + } + } + else if(attribute_name == "dilations") + { + CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3); + lp.set("dilation", parse(attribute_proto.ints())); + } + else if (attribute_proto.has_i()) + { + ::google::protobuf::int64 src = attribute_proto.i(); + if (src < std::numeric_limits::min() || src > std::numeric_limits::max()) + CV_Error(Error::StsOutOfRange, "Input is out of OpenCV 32S range"); + else + lp.set(attribute_name, saturate_cast(src)); + } + else if (attribute_proto.has_f()) + { + lp.set(attribute_name, attribute_proto.f()); + } + else if (attribute_proto.has_s()) + { + lp.set(attribute_name, attribute_proto.s()); + } + else if (attribute_proto.floats_size() > 0) + { + lp.set(attribute_name, DictValue::arrayReal( + attribute_proto.floats().data(), attribute_proto.floats_size())); + } + else if (attribute_proto.ints_size() > 0) + { + lp.set(attribute_name, parse(attribute_proto.ints())); + } + else if (attribute_proto.has_t()) + { + opencv_onnx::TensorProto tensor = attribute_proto.t(); + Mat blob = getMatFromTensor(tensor); + lp.blobs.push_back(blob); + } + else if (attribute_proto.has_g()) + { + CV_Error(Error::StsNotImplemented, cv::format("DNN/ONNX/Attribute[%s]: 'Graph' is not supported", attribute_name.c_str())); + } + else if (attribute_proto.graphs_size() > 0) + { + CV_Error(Error::StsNotImplemented, + cv::format("DNN/ONNX/Attribute[%s]: 'Graphs' (%d) in attributes is not supported", + attribute_name.c_str(), attribute_proto.graphs_size()) + ); + } + else if (attribute_proto.strings_size() > 0) + { + std::string msg = cv::format("DNN/ONNX/Attribute[%s]: 'Strings' (%d) are not supported", + attribute_name.c_str(), attribute_proto.strings_size()); + CV_LOG_ERROR(NULL, msg); + for (int i = 0; i < attribute_proto.strings_size(); i++) + { + CV_LOG_ERROR(NULL, " Attribute[" << attribute_name << "].string(" << i << ") = '" << attribute_proto.strings(i) << "'"); + } + CV_Error(Error::StsNotImplemented, msg); + } + else if (attribute_proto.tensors_size() > 0) + { + CV_Error(Error::StsNotImplemented, + cv::format("DNN/ONNX/Attribute[%s]: 'Tensors' (%d) in attributes are not supported", + attribute_name.c_str(), attribute_proto.tensors_size()) + ); } else { - // Convolution or pooling. 
- CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 4 || attribute_proto.ints_size() == 6); - lp.set("pad", parse(attribute_proto.ints())); + CV_Error(Error::StsNotImplemented, cv::format("DNN/ONNX/Attribute[%s]: unsupported attribute format", attribute_name.c_str())); } } - else if(attribute_name == "auto_pad") + catch (const cv::Exception& e) { - if (attribute_proto.s() == "SAME_UPPER" || attribute_proto.s() == "SAME_LOWER") { - lp.set("pad_mode", "SAME"); - } - else if (attribute_proto.s() == "VALID") { - lp.set("pad_mode", "VALID"); - } - } - else if(attribute_name == "dilations") - { - CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3); - lp.set("dilation", parse(attribute_proto.ints())); - } - else if (attribute_proto.has_i()) - { - ::google::protobuf::int64 src = attribute_proto.i(); - if (src < std::numeric_limits::min() || src > std::numeric_limits::max()) - CV_Error(Error::StsOutOfRange, "Input is out of OpenCV 32S range"); - else - lp.set(attribute_name, saturate_cast(src)); - } - else if (attribute_proto.has_f()) - { - lp.set(attribute_name, attribute_proto.f()); - } - else if (attribute_proto.has_s()) - { - lp.set(attribute_name, attribute_proto.s()); - } - else if (attribute_proto.floats_size() > 0) - { - lp.set(attribute_name, DictValue::arrayReal( - attribute_proto.floats().data(), attribute_proto.floats_size())); - } - else if (attribute_proto.ints_size() > 0) - { - lp.set(attribute_name, parse(attribute_proto.ints())); - } - else if (attribute_proto.has_t()) - { - opencv_onnx::TensorProto tensor = attribute_proto.t(); - Mat blob = getMatFromTensor(tensor); - lp.blobs.push_back(blob); - } - else if (attribute_proto.has_g()) - { - CV_Error(Error::StsNotImplemented, cv::format("DNN/ONNX/Attribute[%s]: 'Graph' is not supported", attribute_name.c_str())); - } - else if (attribute_proto.graphs_size() > 0) - { - CV_Error(Error::StsNotImplemented, - cv::format("DNN/ONNX/Attribute[%s]: 'Graphs' (%d) in attributes is not supported", - attribute_name.c_str(), attribute_proto.graphs_size()) - ); - } - else if (attribute_proto.strings_size() > 0) - { - std::string msg = cv::format("DNN/ONNX/Attribute[%s]: 'Strings' (%d) are not supported", - attribute_name.c_str(), attribute_proto.strings_size()); - CV_LOG_ERROR(NULL, msg); - for (int i = 0; i < attribute_proto.strings_size(); i++) + CV_UNUSED(e); + if (DNN_DIAGNOSTICS_RUN) { - CV_LOG_ERROR(NULL, " Attribute[" << attribute_name << "].string(" << i << ") = '" << attribute_proto.strings(i) << "'"); + CV_LOG_ERROR(NULL, "DNN/ONNX: Potential problem with processing attributes for node " << node_proto.name() << " Attribute " << attribute_name.c_str() + ); + continue; } - CV_Error(Error::StsNotImplemented, msg); - } - else if (attribute_proto.tensors_size() > 0) - { - CV_Error(Error::StsNotImplemented, - cv::format("DNN/ONNX/Attribute[%s]: 'Tensors' (%d) in attributes are not supported", - attribute_name.c_str(), attribute_proto.tensors_size()) - ); - } - else - { - CV_Error(Error::StsNotImplemented, cv::format("DNN/ONNX/Attribute[%s]: unsupported attribute format", attribute_name.c_str())); + throw; } } return lp; @@ -338,7 +361,11 @@ Mat ONNXImporter::getBlob(const std::string& input_name) void ONNXImporter::addLayer(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto) { - int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams); + int id; + if (DNN_DIAGNOSTICS_RUN) + id = utilNet.addLayer(layerParams.name, 
layerParams.type, layerParams); + else + id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams); for (int i = 0; i < node_proto.output_size(); ++i) { layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(id, i))); @@ -351,7 +378,10 @@ void ONNXImporter::addLayer(LayerParams& layerParams, const std::string& input_name = node_proto.input(j); IterLayerId_t layerId = layer_id.find(input_name); if (layerId != layer_id.end()) { - dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum); + if (DNN_DIAGNOSTICS_RUN) + utilNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum); + else + dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum); ++inpNum; // Collect input shapes. IterShape_t shapeIt = outShapes.find(input_name); @@ -360,7 +390,11 @@ void ONNXImporter::addLayer(LayerParams& layerParams, } } // Compute shape of output blob for this layer. - Ptr layer = dstNet.getLayer(id); // FIXIT: avoid instantiation of layers during the import stage + Ptr layer; + if (DNN_DIAGNOSTICS_RUN) + layer = utilNet.getLayer(id); + else + layer = dstNet.getLayer(id); // FIXIT: avoid instantiation of layers during the import stage layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes); for (int i = 0; i < node_proto.output_size() && i < (int)layerOutShapes.size(); ++i) { @@ -437,8 +471,37 @@ void ONNXImporter::populateNet() layer_id.insert(std::make_pair(name, LayerInfo(0, netInputs.size() - 1))); } } + utilNet.setInputsNames(netInputs); dstNet.setInputsNames(netInputs); + if (DNN_DIAGNOSTICS_RUN) { + auto &supportedTypes = getSupportedTypes(); + for (int li = 0; li < layersSize; li++) { + const opencv_onnx::NodeProto &node_proto = graph_proto.node(li); + std::string name = node_proto.output(0); + std::string layer_type = node_proto.op_type(); + auto registered = supportedTypes.find(layer_type); + if (registered == supportedTypes.end()) { + CV_LOG_ERROR(NULL, "DNN/ONNX: NOTE: Potential problem with creating node " << name<< " with type " << layer_type << ".\n Type " + << layer_type << " IS NOT SUPPORTED!\n" + ); + } + } + auto oldConstBlobs = constBlobs; + auto oldOutShapes = outShapes; + auto oldLayerId = layer_id; + CV_LOG_INFO(NULL, "DNN/ONNX: start diagnostic run!"); + for (int li = 0; li < layersSize; li++) { + const opencv_onnx::NodeProto &node_proto = graph_proto.node(li); + handleNode(node_proto); + } + CV_LOG_INFO(NULL, "DNN/ONNX: diagnostic run completed!"); + constBlobs = oldConstBlobs; + outShapes = oldOutShapes; + layer_id = oldLayerId; + enableModelDiagnostics(false); + } + for(int li = 0; li < layersSize; li++) { const opencv_onnx::NodeProto& node_proto = graph_proto.node(li); @@ -448,6 +511,80 @@ void ONNXImporter::populateNet() CV_LOG_DEBUG(NULL, "DNN/ONNX: import completed!"); } +const std::set& ONNXImporter::getSupportedTypes() +{ + static const std::set layerTypes = { + "MaxPool", + "AveragePool", + "GlobalAveragePool", + "GlobalMaxPool", + "ReduceMean", + "ReduceSum", + "ReduceMax", + "Slice", + "Split", + "Add", + "Sum", + "Sub", + "Pow", + "Max", + "Neg", + "Constant", + "LSTM", + "ImageScaler", + "Clip", + "LeakyRelu", + "Relu", + "Elu", + "Tanh", + "PRelu", + "LRN", + "InstanceNormalization", + "BatchNormalization", + "Gemm", + "MatMul", + "Mul", + "Div", + "Conv", + "ConvTranspose", + "Transpose", + "Squeeze", + "Flatten", + "Unsqueeze", + "Expand", + "Reshape", + "Pad", + "Shape", + "Cast", + "ConstantOfShape", + "ConstantFill", + "Gather", + "Concat", + "Resize", + 
"Upsample", + "SoftMax", + "Softmax", + "LogSoftmax", + "DetectionOutput", + "Interp", + "CropAndResize", + "ROIPooling", + "PSROIPooling", + "ChannelsPReLU", + "Sigmoid", + "Swish", + "Mish", + "AbsVal", + "BNLL", + "MaxUnpool", + "Dropout", + "Identity", + "Crop", + "Normalize" + }; + return layerTypes; +} + void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) { opencv_onnx::NodeProto node_proto = node_proto_; // TODO FIXIT @@ -458,11 +595,11 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) CV_LOG_DEBUG(NULL, "DNN/ONNX: processing node with " << node_proto.input_size() << " inputs and " << node_proto.output_size() << " outputs: " << cv::format("[%s]:(%s)", layer_type.c_str(), name.c_str()) ); - + LayerParams layerParams; try { // FIXIT not all cases can be repacked into "LayerParams". Importer should handle such cases directly for each "layer_type" - LayerParams layerParams = getLayerParams(node_proto); + layerParams = getLayerParams(node_proto); layerParams.name = name; layerParams.type = layer_type; @@ -503,7 +640,7 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) MatShape targetShape; std::vector shouldDelete(inpShape.size(), false); for (int i = 0; i < axes.size(); i++) { - int axis = clamp(axes.get(i), inpShape.size()); + int axis = normalize_axis(axes.get(i), inpShape.size()); shouldDelete[axis] = true; } for (int axis = 0; axis < inpShape.size(); ++axis){ @@ -515,7 +652,7 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) if (inpShape.size() == 3 && axes.size() <= 2) { - int axis = clamp(axes.get(0), inpShape.size()); + int axis = normalize_axis(axes.get(0), inpShape.size()); CV_CheckNE(axis, 0, ""); LayerParams reshapeLp; @@ -539,8 +676,8 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) avgLp.set("pool", pool); if (axes.size() == 2) { - CV_CheckEQ(clamp(axes.get(0), inpShape.size()), 1, "Unsupported mode"); - CV_CheckEQ(clamp(axes.get(1), inpShape.size()), 2, "Unsupported mode"); + CV_CheckEQ(normalize_axis(axes.get(0), inpShape.size()), 1, "Unsupported mode"); + CV_CheckEQ(normalize_axis(axes.get(1), inpShape.size()), 2, "Unsupported mode"); avgLp.set("global_pooling", true); } else @@ -560,9 +697,9 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) CV_Assert(axes.size() <= inpShape.size() - 2); std::vector kernel_size(inpShape.size() - 2, 1); - if (axes.size() == 1 && (clamp(axes.get(0), inpShape.size()) <= 1)) + if (axes.size() == 1 && (normalize_axis(axes.get(0), inpShape.size()) <= 1)) { - int axis = clamp(axes.get(0), inpShape.size()); + int axis = normalize_axis(axes.get(0), inpShape.size()); MatShape newShape = inpShape; newShape[axis + 1] = total(newShape, axis + 1); newShape.resize(axis + 2); @@ -584,7 +721,7 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) else { for (int i = 0; i < axes.size(); i++) { - int axis = clamp(axes.get(i), inpShape.size()); + int axis = normalize_axis(axes.get(i), inpShape.size()); CV_Assert_N(axis >= 2 + i, axis < inpShape.size()); kernel_size[axis - 2] = inpShape[axis]; } @@ -641,20 +778,11 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) int axis = 0; std::vector begin; std::vector end; + std::vector steps; int inp_size = node_proto.input_size(); if (inp_size == 1) { - if (layerParams.has("steps")) - { - DictValue steps = layerParams.get("steps"); - for (int i = 0; i < steps.size(); ++i) - { - if (steps.get(i) != 1) - 
CV_Error(Error::StsNotImplemented, - "Slice layer only supports steps = 1"); - } - } if (layerParams.has("axes")) { DictValue axes = layerParams.get("axes"); for (int i = 1; i < axes.size(); ++i) { @@ -677,7 +805,7 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) int finish = ends.get(i); end.push_back((finish < 0) ? --finish : finish); // numpy doesn't include last dim } - } else { + } else { // inp_size > 1 CV_Assert(inp_size >= 3); for (int i = 1; i < inp_size; i++) { CV_Assert(constBlobs.find(node_proto.input(i)) != constBlobs.end()); @@ -711,6 +839,12 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) if (inp_size == 5) { CV_Assert(constBlobs.find(node_proto.input(4)) != constBlobs.end()); Mat step_blob = getBlob(node_proto, 4); + const int* steps_ptr = step_blob.ptr(); + + if (axis > 0) + steps.resize(axis, 1); + + std::copy(steps_ptr, steps_ptr + step_blob.total(), std::back_inserter(steps)); // Very strange application for Slice op with tensor reversing. // We just workaround it for 2d constants. @@ -728,13 +862,15 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) return; } } - CV_CheckEQ(countNonZero(step_blob != 1), 0, "Slice layer only supports steps = 1"); } } layerParams.set("begin", DictValue::arrayInt(&begin[0], begin.size())); layerParams.set("end", DictValue::arrayInt(&end[0], end.size())); layerParams.set("axis", axis); + if (!steps.empty()) + layerParams.set("steps", DictValue::arrayInt(&steps[0], steps.size())); + if (constBlobs.find(node_proto.input(0)) != constBlobs.end()) { Mat inp = getBlob(node_proto, 0); @@ -799,7 +935,11 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) constParams.name = layerParams.name + "/const"; constParams.type = "Const"; constParams.blobs.push_back((isSub ? 
-1 : 1) * blob); - int id = dstNet.addLayer(constParams.name, constParams.type, constParams); + int id; + if (DNN_DIAGNOSTICS_RUN) + id = utilNet.addLayer(constParams.name, constParams.type, constParams); + else + id = dstNet.addLayer(constParams.name, constParams.type, constParams); layer_id.insert(std::make_pair(constParams.name, LayerInfo(id, 0))); outShapes[constParams.name] = shape(blob); @@ -844,12 +984,19 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) powerParams.type = "Power"; powerParams.set("scale", -1); + int id; //Create Power layer - int id = dstNet.addLayer(powerParams.name, powerParams.type, powerParams); + if (DNN_DIAGNOSTICS_RUN) + id = utilNet.addLayer(powerParams.name, powerParams.type, powerParams); + else + id = dstNet.addLayer(powerParams.name, powerParams.type, powerParams); //Connect to input IterLayerId_t layerId = layer_id.find(node_proto.input(1)); CV_Assert(layerId != layer_id.end()); - dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0); + if (DNN_DIAGNOSTICS_RUN) + utilNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0); + else + dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0); //Add shape layer_id.insert(std::make_pair(powerParams.name, LayerInfo(id, 0))); outShapes[powerParams.name] = outShapes[node_proto.input(1)]; @@ -1036,11 +1183,18 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) layerParams.erase("epsilon"); //Create MVN layer - int id = dstNet.addLayer(mvnParams.name, mvnParams.type, mvnParams); + int id; + if (DNN_DIAGNOSTICS_RUN) + id = utilNet.addLayer(mvnParams.name, mvnParams.type, mvnParams); + else + id = dstNet.addLayer(mvnParams.name, mvnParams.type, mvnParams); //Connect to input IterLayerId_t layerId = layer_id.find(node_proto.input(0)); CV_Assert(layerId != layer_id.end()); - dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0); + if (DNN_DIAGNOSTICS_RUN) + utilNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0); + else + dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0); //Add shape layer_id.insert(std::make_pair(mvnParams.name, LayerInfo(id, 0))); outShapes[mvnParams.name] = outShapes[node_proto.input(0)]; @@ -1162,6 +1316,53 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) layerParams.type = "Scale"; } } + else if (!haveVariables) + { + Mat inp0 = getBlob(node_proto, 0); + Mat inp1 = getBlob(node_proto, 1); + + if (inp0.size != inp1.size && (inp0.total() != 1 || inp1.total() != 1)) + CV_Error_(Error::StsNotImplemented, ("Different shapes case is not supported with constant inputs: %s", layer_type.c_str())); + + if (inp0.total() == 1 && inp1.total() == 1 && inp0.dims != inp1.dims) + { + if (inp0.dims < inp1.dims) + { + inp0 = inp0.reshape(1, inp1.dims, inp1.size); + inp0.dims = inp1.dims; + } + else + { + inp1 = inp1.reshape(1, inp0.dims, inp0.size); + inp1.dims = inp0.dims; + } + } + + Mat out; + if (inp0.total() != inp1.total()) + { + if (inp0.total() == 1) + { + float coeff = isDiv ? 1.0 / inp0.at(0) : inp0.at(0); + multiply(inp1, coeff, out); + } + else + { + float coeff = isDiv ? 1.0 / inp1.at(0) : inp1.at(0); + multiply(inp0, coeff, out); + } + + } + else + { + out = isDiv ? 
inp0 / inp1 : inp0.mul(inp1); + } + + if (inp0.dims == 1 && inp1.dims == 1) + out.dims = 1; // to workaround dims == 1 + addConstant(layerParams.name, out); + return; + } else if (outShapes[node_proto.input(0)] == outShapes[node_proto.input(1)]) { layerParams.type = "Eltwise"; @@ -1186,12 +1387,19 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) powerParams.type = "Power"; powerParams.set("power", -1); + int id; //Create Power layer - int id = dstNet.addLayer(powerParams.name, powerParams.type, powerParams); + if (DNN_DIAGNOSTICS_RUN) + id = utilNet.addLayer(powerParams.name, powerParams.type, powerParams); + else + id = dstNet.addLayer(powerParams.name, powerParams.type, powerParams); //Connect to input IterLayerId_t layerId = layer_id.find(node_proto.input(1)); CV_Assert(layerId != layer_id.end()); - dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0); + if (DNN_DIAGNOSTICS_RUN) + utilNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0); + else + dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0); //Add shape layer_id.insert(std::make_pair(powerParams.name, LayerInfo(id, 0))); outShapes[powerParams.name] = outShapes[node_proto.input(1)]; @@ -1201,20 +1409,6 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) } layerParams.type = "Scale"; } - - if (!haveVariables) - { - Mat inp0 = getBlob(node_proto, 0); - Mat inp1 = getBlob(node_proto, 1); - if (inp0.size != inp1.size && inp1.total() != 1) - CV_Error(Error::StsNotImplemented, "Constant multiply with different shapes"); - - Mat out = isDiv ? inp0 / inp1 : inp0.mul(inp1); - out = out.reshape(1, inp0.dims, inp0.size); - out.dims = inp0.dims; // to workaround dims == 1 - addConstant(layerParams.name, out); - return; - } } else if (layer_type == "Conv") { @@ -1343,7 +1537,7 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) if (constBlobs.find(node_proto.input(0)) != constBlobs.end()) { Mat input = getBlob(node_proto, 0); - int axis = clamp(layerParams.get("axis", 1), input.dims); + int axis = normalize_axis(layerParams.get("axis", 1), input.dims); std::vector out_size(&input.size[0], &input.size[0] + axis); out_size.push_back(input.total(axis)); @@ -1733,9 +1927,26 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) if (!hasVariableInps) { std::vector inputs(node_proto.input_size()), concatenated; + // Due constant folding we can get inputs with different number of dimensions + // Insert the missing dimension to inputs + MatShape inputShape; for (size_t i = 0; i < inputs.size(); ++i) { inputs[i] = getBlob(node_proto, i); + if (inputs[i].size.dims() > inputShape.size()) + { + inputShape = shape(inputs[i]); + } + } + + // Concat-1 has default value for axis is 1: https://github.com/onnx/onnx/blob/master/docs/Changelog.md#Concat-1 + int axis = layerParams.get("axis", 1); + for (size_t i = 0; i < inputs.size(); ++i) + { + MatShape targetShape = inputShape; + targetShape[axis] = shape(inputs[i])[axis]; + CV_CheckEQ(total(targetShape), total(shape(inputs[i])), ""); + inputs[i] = inputs[i].reshape(0, targetShape); } runLayer(layerParams, inputs, concatenated); @@ -1873,9 +2084,31 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) } catch (const cv::Exception& e) { - CV_LOG_ERROR(NULL, "DNN/ONNX: ERROR during processing node with " << node_proto.input_size() << " inputs and " << node_proto.output_size() << " outputs: " - << cv::format("[%s]:(%s)", layer_type.c_str(), 
name.c_str()) - ); + if (DNN_DIAGNOSTICS_RUN) + { + CV_LOG_ERROR(NULL, "DNN/ONNX: Potential problem during processing node with " << node_proto.input_size() << " inputs and " << node_proto.output_size() << " outputs: " + << cv::format("[%s]:(%s)", layer_type.c_str(), name.c_str()) << "\n" << e.msg + ); + auto registeredLayers = getLayerFactoryImpl(); + if (registeredLayers.find(layerParams.type) != registeredLayers.end()) + { + try + { + Ptr layer = LayerFactory::createLayerInstance(layerParams.type, layerParams); + } + catch (const std::exception& e) + { + CV_LOG_ERROR(NULL, "DNN/ONNX: Layer of type " << layerParams.type << "(" << layer_type << ") cannot be created with parameters " << layerParams << ". Error: " << e.what() + ); + } + } + } + else + { + CV_LOG_ERROR(NULL, "DNN/ONNX: ERROR during processing node with " << node_proto.input_size() << " inputs and " << node_proto.output_size() << " outputs: " + << cv::format("[%s]:(%s)", layer_type.c_str(), name.c_str()) + ); + } for (int i = 0; i < node_proto.input_size(); i++) { CV_LOG_INFO(NULL, " Input[" << i << "] = '" << node_proto.input(i) << "'"); @@ -1884,7 +2117,16 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_) { CV_LOG_INFO(NULL, " Output[" << i << "] = '" << node_proto.output(i) << "'"); } - CV_Error(Error::StsError, cv::format("Node [%s]:(%s) parse error: %s", layer_type.c_str(), name.c_str(), e.what())); + if (DNN_DIAGNOSTICS_RUN) + { + for (int i = 0; i < node_proto.output_size(); ++i) + { + layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(0, i))); + outShapes[node_proto.output(i)] = outShapes[node_proto.input(0)]; + } + } + else + CV_Error(Error::StsError, cv::format("Node [%s]:(%s) parse error: %s", layer_type.c_str(), name.c_str(), e.what())); } } diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index b7cdc2ad94..d9b98404c3 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -655,6 +655,22 @@ InferenceEngine::Core& getCore(const std::string& id) } #endif +static bool detectArmPlugin_() +{ + InferenceEngine::Core& ie = getCore("CPU"); + const std::vector devices = ie.GetAvailableDevices(); + for (std::vector::const_iterator i = devices.begin(); i != devices.end(); ++i) + { + if (i->find("CPU") != std::string::npos) + { + const std::string name = ie.GetMetric(*i, METRIC_KEY(FULL_DEVICE_NAME)).as(); + CV_LOG_INFO(NULL, "CPU plugin: " << name); + return name.find("arm_compute::NEON") != std::string::npos; + } + } + return false; +} + #if !defined(OPENCV_DNN_IE_VPU_TYPE_DEFAULT) static bool detectMyriadX_(std::string device) { @@ -1185,6 +1201,12 @@ bool isMyriadX() return myriadX; } +bool isArmComputePlugin() +{ + static bool armPlugin = getInferenceEngineCPUType() == CV_DNN_INFERENCE_ENGINE_CPU_TYPE_ARM_COMPUTE; + return armPlugin; +} + static std::string getInferenceEngineVPUType_() { static std::string param_vpu_type = utils::getConfigurationParameterString("OPENCV_DNN_IE_VPU_TYPE", ""); @@ -1223,6 +1245,14 @@ cv::String getInferenceEngineVPUType() return vpu_type; } +cv::String getInferenceEngineCPUType() +{ + static cv::String cpu_type = detectArmPlugin_() ? 
+ CV_DNN_INFERENCE_ENGINE_CPU_TYPE_ARM_COMPUTE : + CV_DNN_INFERENCE_ENGINE_CPU_TYPE_X86; + return cpu_type; +} + #else // HAVE_INF_ENGINE cv::String getInferenceEngineBackendType() @@ -1238,6 +1268,11 @@ cv::String getInferenceEngineVPUType() { CV_Error(Error::StsNotImplemented, "This OpenCV build doesn't include InferenceEngine support"); } + +cv::String getInferenceEngineCPUType() +{ + CV_Error(Error::StsNotImplemented, "This OpenCV build doesn't include InferenceEngine support"); +} #endif // HAVE_INF_ENGINE diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index fcd1a6927d..f52334bc45 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -28,10 +28,12 @@ #define INF_ENGINE_RELEASE_2020_3 2020030000 #define INF_ENGINE_RELEASE_2020_4 2020040000 #define INF_ENGINE_RELEASE_2021_1 2021010000 +#define INF_ENGINE_RELEASE_2021_2 2021020000 +#define INF_ENGINE_RELEASE_2021_3 2021030000 #ifndef INF_ENGINE_RELEASE -#warning("IE version have not been provided via command-line. Using 2021.1 by default") -#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2021_1 +#warning("IE version have not been provided via command-line. Using 2021.3 by default") +#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2021_3 #endif #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000)) @@ -254,8 +256,11 @@ CV__DNN_INLINE_NS_BEGIN bool isMyriadX(); +bool isArmComputePlugin(); + CV__DNN_INLINE_NS_END + InferenceEngine::Core& getCore(const std::string& id); template diff --git a/modules/dnn/src/opencl/activations.cl b/modules/dnn/src/opencl/activations.cl index b900e6add6..68f0dd7268 100644 --- a/modules/dnn/src/opencl/activations.cl +++ b/modules/dnn/src/opencl/activations.cl @@ -140,3 +140,14 @@ __kernel void ELUForward(const int n, __global const T* in, __global T* out) out[index] = (src >= 0.f) ? src : exp(src) - 1; } } + +__kernel void ExpForward(const int n, __global const T* in, __global T* out, + const KERNEL_ARG_DTYPE normScale, + const KERNEL_ARG_DTYPE normShift) +{ + int index = get_global_id(0); + if (index < n) + { + out[index] = exp(normShift + normScale * in[index]); + } +} diff --git a/modules/dnn/src/opencl/conv_spatial_helper.cl b/modules/dnn/src/opencl/conv_spatial_helper.cl index 9d5a89f7b1..33d9db57c8 100644 --- a/modules/dnn/src/opencl/conv_spatial_helper.cl +++ b/modules/dnn/src/opencl/conv_spatial_helper.cl @@ -39,9 +39,14 @@ // //M*/ +#ifdef HALF_SUPPORT +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16:enable +#endif +#endif + #define CONCAT(A,B) A##_##B #define TEMPLATE(name,type) CONCAT(name,type) -#define Dtype float __kernel void TEMPLATE(copyWeightsSwizzled, Dtype) (__global Dtype* weightIn, diff --git a/modules/dnn/src/opencl/gemm_buffer.cl b/modules/dnn/src/opencl/gemm_buffer.cl index 8cbc34dde5..b345983aee 100644 --- a/modules/dnn/src/opencl/gemm_buffer.cl +++ b/modules/dnn/src/opencl/gemm_buffer.cl @@ -90,6 +90,12 @@ #pragma OPENCL EXTENSION cl_intel_subgroups : enable #endif +#ifdef ZERO_BETA +#define BETA_ZERO_CHECK(b0, v) (b0) +#else +#define BETA_ZERO_CHECK(b0, v) (v) +#endif + #define VEC_SIZE 4 #define LWG_HEIGHT 4 #define TILE_M 8 @@ -143,14 +149,14 @@ __kernel void TEMPLATE(gemm_buffer_NN, Dtype)( int row6 = mad24(global_y, TILE_M, 6) < M ? 6 : border; int row7 = mad24(global_y, TILE_M, 7) < M ? 7 : border; - Dtype4 dot00 = (start_index != 0) ? vload4(0, dst_write0) : beta * vload4(0, dst_write0); - Dtype4 dot01 = (start_index != 0) ? 
vload4(0, dst_write0 + 1 * N) : beta * vload4(0, dst_write0 + 1 * N); - Dtype4 dot02 = (start_index != 0) ? vload4(0, dst_write0 + 2 * N) : beta * vload4(0, dst_write0 + 2 * N); - Dtype4 dot03 = (start_index != 0) ? vload4(0, dst_write0 + 3 * N) : beta * vload4(0, dst_write0 + 3 * N); - Dtype4 dot04 = (start_index != 0) ? vload4(0, dst_write0 + 4 * N) : beta * vload4(0, dst_write0 + 4 * N); - Dtype4 dot05 = (start_index != 0) ? vload4(0, dst_write0 + 5 * N) : beta * vload4(0, dst_write0 + 5 * N); - Dtype4 dot06 = (start_index != 0) ? vload4(0, dst_write0 + 6 * N) : beta * vload4(0, dst_write0 + 6 * N); - Dtype4 dot07 = (start_index != 0) ? vload4(0, dst_write0 + 7 * N) : beta * vload4(0, dst_write0 + 7 * N); + Dtype4 dot00 = (start_index != 0) ? vload4(0, dst_write0) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0)); + Dtype4 dot01 = (start_index != 0) ? vload4(0, dst_write0 + 1 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 1 * N)); + Dtype4 dot02 = (start_index != 0) ? vload4(0, dst_write0 + 2 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 2 * N)); + Dtype4 dot03 = (start_index != 0) ? vload4(0, dst_write0 + 3 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 3 * N)); + Dtype4 dot04 = (start_index != 0) ? vload4(0, dst_write0 + 4 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 4 * N)); + Dtype4 dot05 = (start_index != 0) ? vload4(0, dst_write0 + 5 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 5 * N)); + Dtype4 dot06 = (start_index != 0) ? vload4(0, dst_write0 + 6 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 6 * N)); + Dtype4 dot07 = (start_index != 0) ? vload4(0, dst_write0 + 7 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 7 * N)); int end_index = min(start_index + 256, K); int w = start_index; @@ -579,7 +585,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)( output = (local_x == 5) ? _dot.s5 : output; \ output = (local_x == 6) ? _dot.s6 : output; \ output = (local_x == 7) ? _dot.s7 : output; \ - dst_write0[0] = mad(output, alpha, beta * dst_write0[0]); \ + dst_write0[0] = BETA_ZERO_CHECK(alpha * output, mad(output, alpha, beta * dst_write0[0])); \ dst_write0 += N; if(global_x < N && global_y * 8 < M) { @@ -765,7 +771,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)( output = (local_x == 5) ? _dot.s5 : output; \ output = (local_x == 6) ? _dot.s6 : output; \ output = (local_x == 7) ? 
_dot.s7 : output; \ - dst_write0[0] = mad(output, alpha, beta * dst_write0[0]); \ + dst_write0[0] = BETA_ZERO_CHECK(alpha * output, mad(output, alpha, beta * dst_write0[0])); \ dst_write0 += N; if(global_x < N && global_y * 8 < M) { @@ -819,8 +825,9 @@ void TEMPLATE(gemm_buffer_NT_M_2_edgerows,Dtype)( const Dtype4 b1 = {srca_read1[i*4], srca_read1[(i*4+1)], srca_read1[(i*4+2)], srca_read1[(i*4+3)]}; #pragma unroll for(int j = 0; j < rows; ++j) { - dot0[j] += b0 * vload4(i, srcb_read + j * K); - dot1[j] += b1 * vload4(i, srcb_read + j * K); + Dtype4 a = vload4(i, srcb_read + j * K); + dot0[j] += b0 * a; + dot1[j] += b1 * a; } i += get_local_size(0); @@ -859,11 +866,19 @@ void TEMPLATE(gemm_buffer_NT_M_2_edgerows,Dtype)( } } + barrier(CLK_LOCAL_MEM_FENCE); if(lid == 0) { #pragma unroll for(int j = 0; j < rows; ++j) { - dstc0[(x_gid * 4 + j)] = alpha * work_each0[j] + beta * dstc0[(x_gid * 4 + j)]; - dstc1[(x_gid * 4 + j)] = alpha * work_each1[j] + beta * dstc1[(x_gid * 4 + j)]; +#ifdef ZERO_BETA + Dtype a0 = alpha * work_each0[j]; + Dtype a1 = alpha * work_each1[j]; +#else + Dtype a0 = alpha * work_each0[j] + beta * dstc0[(x_gid * 4 + j)]; + Dtype a1 = alpha * work_each1[j] + beta * dstc1[(x_gid * 4 + j)]; +#endif + dstc0[(x_gid * 4 + j)] = a0; + dstc1[(x_gid * 4 + j)] = a1; } } } @@ -952,9 +967,15 @@ __kernel void TEMPLATE(gemm_buffer_NT_M_2,Dtype)( } } - if(lid == 0) { + if(lid == 0) + { +#ifdef ZERO_BETA + dstc0[x_gid] = alpha * work0[0]; + dstc1[x_gid] = alpha * work1[0]; +#else dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid]; dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid]; +#endif } } } @@ -1058,10 +1079,17 @@ void TEMPLATE(gemm_buffer_NT_M_4_edgerows,Dtype)( if(lid == 0) { #pragma unroll for(int j = 0; j < rows; ++j) { +#ifdef ZERO_BETA + dstc0[(x_gid * 4 + j)] = alpha * work_each0[j]; + dstc1[(x_gid * 4 + j)] = alpha * work_each1[j]; + dstc2[(x_gid * 4 + j)] = alpha * work_each2[j]; + dstc3[(x_gid * 4 + j)] = alpha * work_each3[j]; +#else dstc0[(x_gid * 4 + j)] = alpha * work_each0[j] + beta * dstc0[(x_gid * 4 + j)]; dstc1[(x_gid * 4 + j)] = alpha * work_each1[j] + beta * dstc1[(x_gid * 4 + j)]; dstc2[(x_gid * 4 + j)] = alpha * work_each2[j] + beta * dstc2[(x_gid * 4 + j)]; dstc3[(x_gid * 4 + j)] = alpha * work_each3[j] + beta * dstc3[(x_gid * 4 + j)]; +#endif } } } @@ -1179,10 +1207,17 @@ __kernel void TEMPLATE(gemm_buffer_NT_M_4,Dtype)( } if(lid == 0) { +#ifdef ZERO_BETA + dstc0[x_gid] = alpha * work0[0]; + dstc1[x_gid] = alpha * work1[0]; + dstc2[x_gid] = alpha * work2[0]; + dstc3[x_gid] = alpha * work3[0]; +#else dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid]; dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid]; dstc2[x_gid] = alpha * work2[0] + beta * dstc2[x_gid]; dstc3[x_gid] = alpha * work3[0] + beta * dstc3[x_gid]; +#endif } } } @@ -1320,6 +1355,16 @@ __kernel void TEMPLATE(gemm_buffer_NT_M_8,Dtype)( } if(lid == 0) { +#ifdef ZERO_BETA + dstc0[x_gid] = alpha * work0[0]; + dstc1[x_gid] = alpha * work1[0]; + dstc2[x_gid] = alpha * work2[0]; + dstc3[x_gid] = alpha * work3[0]; + dstc4[x_gid] = alpha * work4[0]; + dstc5[x_gid] = alpha * work5[0]; + dstc6[x_gid] = alpha * work6[0]; + dstc7[x_gid] = alpha * work7[0]; +#else dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid]; dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid]; dstc2[x_gid] = alpha * work2[0] + beta * dstc2[x_gid]; @@ -1328,6 +1373,7 @@ __kernel void TEMPLATE(gemm_buffer_NT_M_8,Dtype)( dstc5[x_gid] = alpha * work5[0] + beta * dstc5[x_gid]; dstc6[x_gid] = alpha * work6[0] + beta * 
dstc6[x_gid]; dstc7[x_gid] = alpha * work7[0] + beta * dstc7[x_gid]; +#endif } } #undef SLM_SIZE diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 45dfdad9e8..65695b8504 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -11,6 +11,12 @@ Implementation of Tensorflow models parser #include "../precomp.hpp" +#include +#include +#undef CV_LOG_STRIP_LEVEL +#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1 +#include + #ifdef HAVE_PROTOBUF #include "tf_io.hpp" @@ -93,7 +99,7 @@ void blobShapeFromTensor(const tensorflow::TensorProto &tensor, MatShape& shape) shape[i] = (int)_shape.dim(i).size(); } else - shape.resize(1, 1); // Scalar. + shape.resize(1, 1); // Scalar. // FIXIT: should be empty } else { @@ -258,7 +264,7 @@ const tensorflow::AttrValue& getLayerAttr(const tensorflow::NodeDef &layer, cons return layer.attr().at(name); } -static int getDataLayout(const tensorflow::NodeDef& layer) +static DataLayout getDataLayout(const tensorflow::NodeDef& layer) { if (hasLayerAttr(layer, "data_format")) { @@ -280,13 +286,32 @@ static inline std::string getNodeName(const std::string& tensorName) return tensorName.substr(0, tensorName.rfind(':')); } -static inline int getDataLayout(const std::string& layerName, - const std::map& data_layouts) +static inline +DataLayout getDataLayout( + const std::string& layerName, + const std::map& data_layouts +) { - std::map::const_iterator it = data_layouts.find(getNodeName(layerName)); + std::map::const_iterator it = data_layouts.find(getNodeName(layerName)); return it != data_layouts.end() ? it->second : DATA_LAYOUT_UNKNOWN; } +static +bool hasAllOnes(const Mat &inputs, int startPos, int endPos) +{ + CV_CheckLE(inputs.dims, 2, ""); + CV_CheckGE(startPos, 0, ""); + CV_CheckLE(startPos, endPos, ""); + CV_CheckLT((size_t)endPos, inputs.total(), ""); + + for (int i = startPos; i < endPos; i++) + { + if (inputs.at(i) != 1 && inputs.at(i) != -1) + return false; + } + return true; +} + void setStrides(LayerParams &layerParams, const tensorflow::NodeDef &layer) { if (hasLayerAttr(layer, "strides")) @@ -389,7 +414,7 @@ Pin parsePin(const std::string &name) { Pin pin(name); - size_t delimiter_pos = name.find_first_of(":"); + size_t delimiter_pos = name.find_first_of(':'); if (delimiter_pos != std::string::npos) { pin.name = name.substr(0, delimiter_pos); @@ -439,15 +464,20 @@ void ExcludeLayer(tensorflow::GraphDef& net, const int layer_index, const int in net.mutable_node()->DeleteSubrange(layer_index, 1); } -class TFImporter { +class TFImporter +{ public: - TFImporter(const char *model, const char *config = NULL); - TFImporter(const char *dataModel, size_t lenModel, + TFImporter(Net& net, const char *model, const char *config = NULL); + TFImporter(Net& net, const char *dataModel, size_t lenModel, const char *dataConfig = NULL, size_t lenConfig = 0); +protected: + Net& dstNet; + void populateNet(); - void populateNet(Net dstNet); + void parseNode(const tensorflow::NodeDef& layer); + + DataLayout predictOutputDataLayout(const tensorflow::NodeDef& layer); -private: void kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob); void connect(const std::map& layers_name_id_map, Net& network, const Pin& outPin, @@ -467,23 +497,56 @@ private: std::vector netInputsNames; std::vector netInputShapes; + + std::set layers_to_ignore; + std::map data_layouts; + + // find all Const layers for params + std::map value_id; + // A map with constant blobs which are 
shared between multiple layers. + std::map sharedWeights; + + std::map layer_id; + +private: + void addPermuteLayer(const int* order, const std::string& permName, Pin& inpId); }; -TFImporter::TFImporter(const char *model, const char *config) +TFImporter::TFImporter(Net& net, const char *model, const char *config) + : dstNet(net) { if (model && model[0]) + { + CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from file: " << model); ReadTFNetParamsFromBinaryFileOrDie(model, &netBin); + } if (config && config[0]) + { + CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from file: " << config); ReadTFNetParamsFromTextFileOrDie(config, &netTxt); + } + + populateNet(); } -TFImporter::TFImporter(const char *dataModel, size_t lenModel, - const char *dataConfig, size_t lenConfig) +TFImporter::TFImporter( + Net& net, + const char *dataModel, size_t lenModel, + const char *dataConfig, size_t lenConfig +) + : dstNet(net) { if (dataModel != NULL && lenModel > 0) + { + CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from memory (" << lenModel << " bytes)"); ReadTFNetParamsFromBinaryBufferOrDie(dataModel, lenModel, &netBin); + } if (dataConfig != NULL && lenConfig > 0) + { + CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from memory (" << lenConfig << " bytes)"); ReadTFNetParamsFromTextBufferOrDie(dataConfig, lenConfig, &netTxt); + } + populateNet(); } void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob) @@ -612,84 +675,98 @@ const tensorflow::TensorProto& TFImporter::getConstBlob(const tensorflow::NodeDe static void addConstNodes(tensorflow::GraphDef& net, std::map& const_layers, std::set& layers_to_ignore) { + CV_LOG_DEBUG(NULL, "DNN/TF: addConstNodes(): handling " << net.node_size() << " nodes..."); for (int li = 0; li < net.node_size(); li++) { const tensorflow::NodeDef &layer = net.node(li); String name = layer.name(); String type = layer.op(); - if (type == "Dequantize") + //CV_LOG_DEBUG(NULL, "DNN/TF: layer_id=" << li << " - '" << name << "' @ " << type); + + try { - // Example of Dequantize node: - // name: "conv2d_1/bias" - // op: "Dequantize" - // input: "conv2d_1/bias_quantized_const" (tensor of dtype DT_QUINT8) - // input: "conv2d_1/bias_quantized_min" - // input: "conv2d_1/bias_quantized_max" - // attr { key: "T" value { type: DT_QUINT8 } } (quantized type) - // attr { key: "mode" value { s: "MIN_FIRST" } } (quantization technique) - CV_Assert(layer.input_size() == 3); - for (int i = 0; i < 3; ++i) - CV_Assert(const_layers.find(layer.input(i)) != const_layers.end()); - CV_Assert(hasLayerAttr(layer, "mode") && - getLayerAttr(layer, "mode").s() == "MIN_FIRST"); + if (type == "Dequantize") + { + // Example of Dequantize node: + // name: "conv2d_1/bias" + // op: "Dequantize" + // input: "conv2d_1/bias_quantized_const" (tensor of dtype DT_QUINT8) + // input: "conv2d_1/bias_quantized_min" + // input: "conv2d_1/bias_quantized_max" + // attr { key: "T" value { type: DT_QUINT8 } } (quantized type) + // attr { key: "mode" value { s: "MIN_FIRST" } } (quantization technique) + CV_CheckEQ(layer.input_size(), 3, "Dequantize: 3 inputs is supported only"); + for (int i = 0; i < 3; ++i) + CV_Assert(const_layers.find(layer.input(i)) != const_layers.end()); + CV_Assert(hasLayerAttr(layer, "mode") && + getLayerAttr(layer, "mode").s() == "MIN_FIRST"); - int tensorId = const_layers[layer.input(0)]; - int minId = const_layers[layer.input(1)]; - int maxId = const_layers[layer.input(2)]; + int tensorId = const_layers[layer.input(0)]; + int minId = 
const_layers[layer.input(1)]; + int maxId = const_layers[layer.input(2)]; - tensorflow::TensorProto* tensor = net.mutable_node(tensorId) - ->mutable_attr()->at("value") - .mutable_tensor(); - CV_Assert(tensor->dtype() == tensorflow::DT_QUINT8); + tensorflow::TensorProto* tensor = net.mutable_node(tensorId) + ->mutable_attr()->at("value") + .mutable_tensor(); + CV_CheckEQ((int)tensor->dtype(), (int)tensorflow::DT_QUINT8, ""); - Mat qMin = getTensorContent(net.node(minId).attr().at("value").tensor()); - Mat qMax = getTensorContent(net.node(maxId).attr().at("value").tensor()); - CV_Assert_N(qMin.total() == 1, qMin.type() == CV_32FC1, - qMax.total() == 1, qMax.type() == CV_32FC1); + Mat qMin = getTensorContent(net.node(minId).attr().at("value").tensor()); + Mat qMax = getTensorContent(net.node(maxId).attr().at("value").tensor()); + CV_CheckEQ(qMin.total(), (size_t)1, ""); + CV_CheckTypeEQ(qMin.type(), CV_32FC1, ""); + CV_CheckEQ(qMax.total(), (size_t)1, ""); + CV_CheckTypeEQ(qMax.type(), CV_32FC1, ""); - Mat content = getTensorContent(*tensor); + Mat content = getTensorContent(*tensor); - float minVal = qMin.at(0); - float rangeScale = (qMax.at(0) - minVal) / 255; - CV_Assert(rangeScale >= 0); - content.convertTo(content, CV_32FC1, rangeScale, - rangeScale * cvRound(minVal / rangeScale)); + float minVal = qMin.at(0); + float rangeScale = (qMax.at(0) - minVal) / 255; + CV_Assert(rangeScale >= 0); + content.convertTo(content, CV_32FC1, rangeScale, + rangeScale * cvRound(minVal / rangeScale)); - tensor->set_dtype(tensorflow::DT_FLOAT); - tensor->set_tensor_content(content.data, content.total() * content.elemSize1()); + tensor->set_dtype(tensorflow::DT_FLOAT); + tensor->set_tensor_content(content.data, content.total() * content.elemSize1()); - net.mutable_node(tensorId)->set_name(name); - CV_Assert(const_layers.insert(std::make_pair(name, tensorId)).second); + net.mutable_node(tensorId)->set_name(name); + CV_Assert(const_layers.insert(std::make_pair(name, tensorId)).second); + layers_to_ignore.insert(name); + continue; + } + else if (type != "Const") + continue; // only Const parameters are supported + + if (layer.attr().find("value") != layer.attr().end()) + { + CV_Assert(const_layers.insert(std::make_pair(name, li)).second); + } layers_to_ignore.insert(name); - continue; } - else if (type != "Const") - continue; // only Const parameters are supported - - if (layer.attr().find("value") != layer.attr().end()) + catch (const std::exception& e) { - CV_Assert(const_layers.insert(std::make_pair(name, li)).second); + CV_LOG_ERROR(NULL, "DNN/TF: Can't handle node='" << name << "'. Exception: " << e.what()); + throw; } - layers_to_ignore.insert(name); } + CV_LOG_DEBUG(NULL, "DNN/TF: layers_to_ignore.size() = " << layers_to_ignore.size()); } // If all inputs of specific layer have the same data layout we can say that // this layer's output has this data layout too. Returns DATA_LAYOUT_UNKNOWN otherwise. 
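
The rule in the comment above — take the layout from the node's own attributes if present, otherwise unify the layouts of its inputs, where any disagreement collapses the result to unknown — can be sketched standalone. A minimal illustration of the unification step; the enum values here are illustrative, not the importer's actual ones:

    #include <vector>

    enum DataLayout { DATA_LAYOUT_UNKNOWN = -1, DATA_LAYOUT_NHWC = 0, DATA_LAYOUT_NCHW = 1 };

    // Unify the known layouts of a node's inputs: the first known layout
    // becomes the candidate; a conflicting known layout gives up.
    DataLayout unifyInputLayouts(const std::vector<DataLayout>& inputs)
    {
        DataLayout result = DATA_LAYOUT_UNKNOWN;
        for (size_t i = 0; i < inputs.size(); ++i)
        {
            if (inputs[i] == DATA_LAYOUT_UNKNOWN)
                continue;                    // this input does not constrain the output
            if (result == DATA_LAYOUT_UNKNOWN)
                result = inputs[i];          // first piece of evidence
            else if (result != inputs[i])
                return DATA_LAYOUT_UNKNOWN;  // conflicting evidence
        }
        return result;
    }
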
-static int predictOutputDataLayout(const tensorflow::GraphDef& net, - const tensorflow::NodeDef& layer, - const std::map& data_layouts) +DataLayout TFImporter::predictOutputDataLayout(const tensorflow::NodeDef& layer) { - int layout = getDataLayout(layer); + DataLayout layout = getDataLayout(layer); if (layout != DATA_LAYOUT_UNKNOWN) + { + CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from attrs)"); return layout; + } // Determine layout by layer's inputs - std::map::const_iterator it; for (int i = 0, n = layer.input_size(); i < n; ++i) { - it = data_layouts.find(getNodeName(layer.input(i))); + std::map::const_iterator it = data_layouts.find(getNodeName(layer.input(i))); if (it != data_layouts.end()) { if (layout != DATA_LAYOUT_UNKNOWN) @@ -703,71 +780,72 @@ static int predictOutputDataLayout(const tensorflow::GraphDef& net, } if (layout != DATA_LAYOUT_UNKNOWN) + { + CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from inputs)"); return layout; + } // Determine layout by layer's consumers recursively. - it = data_layouts.find(layer.name()); + std::map::const_iterator it = data_layouts.find(layer.name()); CV_Assert(it != data_layouts.end()); return it->second; } -void TFImporter::populateNet(Net dstNet) +void TFImporter::populateNet() { - if (!netTxt.ByteSize()) - removePhaseSwitches(netBin); + CV_Assert(netBin.ByteSize() || netTxt.ByteSize()); - RemoveIdentityOps(netBin); - RemoveIdentityOps(netTxt); + CV_LOG_INFO(NULL, "DNN/TF: parsing model" + << (netBin.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netBin.versions().producer(), (int)netBin.versions().min_consumer()) : cv::String(" (N/A version info)")) + << ". Number of nodes = " << netBin.node_size() + ); - if (!netTxt.ByteSize()) + if (netTxt.ByteSize()) { - simplifySubgraphs(netBin); - sortByExecutionOrder(netBin); + CV_LOG_INFO(NULL, "DNN/TF: parsing config" + << (netTxt.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netTxt.versions().producer(), (int)netTxt.versions().min_consumer()) : cv::String(" (N/A version info)")) + << ". Number of nodes = " << netTxt.node_size() + ); + + RemoveIdentityOps(netBin); + CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes"); + RemoveIdentityOps(netTxt); + CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(config) => " << netTxt.node_size() << " nodes"); + + sortByExecutionOrder(netTxt); + CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(config) => " << netTxt.node_size() << " nodes"); } else { - sortByExecutionOrder(netTxt); - } + removePhaseSwitches(netBin); + CV_LOG_DEBUG(NULL, "DNN/TF: removePhaseSwitches(model) => " << netBin.node_size() << " nodes"); - std::set layers_to_ignore; + RemoveIdentityOps(netBin); + CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes"); + + simplifySubgraphs(netBin); + CV_LOG_DEBUG(NULL, "DNN/TF: simplifySubgraphs(model) => " << netBin.node_size() << " nodes"); + sortByExecutionOrder(netBin); + CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(model) => " << netBin.node_size() << " nodes"); + } tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin; int layersSize = net.node_size(); - std::map data_layouts; // Pre-fill data layouts where they are set explicitly. 
// Assuming that nodes are in topological order - for (int i = net.node_size() - 1; i >= 0; --i) + for (int i = layersSize - 1; i >= 0; --i) { const tensorflow::NodeDef& layer = net.node(i); std::string name = layer.name(); - int layout = getDataLayout(layer); - std::map::iterator it = data_layouts.find(name); - if (it != data_layouts.end()) - { - if (layout != DATA_LAYOUT_UNKNOWN) - { - if (it->second == DATA_LAYOUT_UNKNOWN) - it->second = layout; - else if (it->second != layout) - { - it->second = DATA_LAYOUT_UNKNOWN; - layout = DATA_LAYOUT_UNKNOWN; - } - } - else - layout = it->second; - } - else - data_layouts[name] = layout; + CV_LOG_DEBUG(NULL, "DNN/TF: node(" << i << " - '" << name << "') propagating layout..."); - // Specify input layers to have the same data layout. - for (int j = 0; j < layer.input_size(); ++j) + try { - name = getNodeName(layer.input(j)); - it = data_layouts.find(name); + DataLayout layout = getDataLayout(layer); + std::map::iterator it = data_layouts.find(name); if (it != data_layouts.end()) { if (layout != DATA_LAYOUT_UNKNOWN) @@ -775,38 +853,105 @@ void TFImporter::populateNet(Net dstNet) if (it->second == DATA_LAYOUT_UNKNOWN) it->second = layout; else if (it->second != layout) + { it->second = DATA_LAYOUT_UNKNOWN; + layout = DATA_LAYOUT_UNKNOWN; + } } + else + layout = it->second; } else data_layouts[name] = layout; + + // Specify input layers to have the same data layout. + for (int j = 0; j < layer.input_size(); ++j) + { + name = getNodeName(layer.input(j)); + it = data_layouts.find(name); + if (it != data_layouts.end()) + { + if (layout != DATA_LAYOUT_UNKNOWN) + { + if (it->second == DATA_LAYOUT_UNKNOWN) + it->second = layout; + else if (it->second != layout) + it->second = DATA_LAYOUT_UNKNOWN; + } + } + else + data_layouts[name] = layout; + } + } + catch (const std::exception& e) + { + CV_LOG_ERROR(NULL, "DNN/TF: Can't propagate layout for node='" << name << "'. Exception: " << e.what()); + throw; } } - // find all Const layers for params - std::map value_id; - // A map with constant blobs which are shared between multiple layers. 
- std::map sharedWeights; addConstNodes(netBin, value_id, layers_to_ignore); addConstNodes(netTxt, value_id, layers_to_ignore); - std::map layer_id; for (int li = 0; li < layersSize; li++) { - tensorflow::NodeDef layer = net.node(li); - String name = layer.name(); - String type = layer.op(); + const tensorflow::NodeDef& layer = net.node(li); + + const std::string name = layer.name(); + const std::string type = layer.op(); + const int ninputs = layer.input_size(); + CV_LOG_DEBUG(NULL, "DNN/TF: (" << li << "/" << layersSize << ") Parse layer " << name << " @ " << type << " with " << ninputs << " inputs"); + + parseNode(layer); + } + + for (size_t i = 0; i < netInputsNames.size(); i++) + { + CV_LOG_DEBUG(NULL, "DNN/TF: Model input: " << i << " - '" << netInputsNames[i] << "'"); + CV_Assert(!netInputsNames[i].empty()); + } + dstNet.setInputsNames(netInputsNames); + CV_LOG_DEBUG(NULL, "DNN/TF: ===================== Import completed ====================="); +} + +void TFImporter::addPermuteLayer(const int* order, const std::string& permName, Pin& inpId) +{ + LayerParams permLP; + permLP.set("order", DictValue::arrayInt(order, 4)); + CV_Assert(layer_id.find(permName) == layer_id.end()); + int permId = dstNet.addLayer(permName, "Permute", permLP); + layer_id[permName] = permId; + connect(layer_id, dstNet, inpId, permId, 0); + inpId = Pin(permName); +} + +void TFImporter::parseNode(const tensorflow::NodeDef& layer_) +{ + tensorflow::NodeDef layer = layer_; + + tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin; + + /*const*/ std::string name = layer.name(); + /*const*/ std::string type = layer.op(); + /*const*/ int num_inputs = layer.input_size(); + + try + { LayerParams layerParams; - if(layers_to_ignore.find(name) != layers_to_ignore.end()) - continue; + if (layers_to_ignore.find(name) != layers_to_ignore.end()) + { + CV_LOG_DEBUG(NULL, "DNN/TF: ignored"); + return; + } - int predictedLayout = predictOutputDataLayout(net, layer, data_layouts); + DataLayout predictedLayout = predictOutputDataLayout(layer); data_layouts[name] = predictedLayout; if (type == "Conv2D" || type == "SpaceToBatchND" || type == "DepthwiseConv2dNative" || type == "Pad" || type == "MirrorPad" || type == "Conv3D") { + CV_CheckGT(num_inputs, 0, ""); // The first node of dilated convolution subgraph. // Extract input node, dilation rate and paddings. 
std::string input = layer.input(0); @@ -824,7 +969,7 @@ void TFImporter::populateNet(Net dstNet) // input: "input" // input: "SpaceToBatchND/block_shape" // input: "SpaceToBatchND/paddings" - CV_Assert(layer.input_size() == 3); + CV_CheckEQ(num_inputs, 3, ""); DictValue dilation = parseDims(getConstBlob(layer, value_id, 1)); CV_Assert(dilation.size() == 2); @@ -839,10 +984,14 @@ void TFImporter::populateNet(Net dstNet) layerParams.set("pad_w", paddings.at(2)); CV_Assert(next_layers.size() == 1); - layer = net.node(next_layers[0].second); layers_to_ignore.insert(next_layers[0].first); + + // FIXIT don't override, rewrite this code + layer = net.node(next_layers[0].second); name = layer.name(); type = layer.op(); + num_inputs = layer.input_size(); + CV_LOG_DEBUG(NULL, "DNN/TF: switched to layer " << name << " @ " << type << ") with " << num_inputs << " inputs"); } else if (type == "Pad" || type == "MirrorPad") { @@ -876,7 +1025,7 @@ void TFImporter::populateNet(Net dstNet) layer_id[name] = id; connect(layer_id, dstNet, parsePin(input), id, 0); - continue; + return; } else { @@ -886,10 +1035,14 @@ void TFImporter::populateNet(Net dstNet) layerParams.set("pad_h", paddings.at(4)); layerParams.set("pad_w", paddings.at(6)); - layer = net.node(next_layers[0].second); layers_to_ignore.insert(next_layers[0].first); + + // FIXIT don't override, rewrite this code + layer = net.node(next_layers[0].second); name = layer.name(); type = layer.op(); + num_inputs = layer.input_size(); + CV_LOG_DEBUG(NULL, "DNN/TF: switched to layer " << name << " @ " << type << ") with " << num_inputs << " inputs"); } } @@ -1011,13 +1164,14 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "BiasAdd" || type == "Add" || type == "AddV2" || type == "Sub" || type=="AddN") { + CV_CheckGT(num_inputs, 0, ""); bool haveConst = false; - for(int ii = 0; !haveConst && ii < layer.input_size(); ++ii) + for(int ii = 0; !haveConst && ii < num_inputs; ++ii) { Pin input = parsePin(layer.input(ii)); haveConst = value_id.find(input.name) != value_id.end(); } - CV_Assert(!haveConst || layer.input_size() == 2); + CV_Assert(!haveConst || num_inputs == 2); if (haveConst) { @@ -1054,7 +1208,7 @@ void TFImporter::populateNet(Net dstNet) int id = dstNet.addLayer(name, "Eltwise", layerParams); layer_id[name] = id; - for (int ii = 0; ii < layer.input_size(); ii++) + for (int ii = 0; ii < num_inputs; ii++) { Pin inp = parsePin(layer.input(ii)); if (layer_id.find(inp.name) == layer_id.end()) @@ -1065,7 +1219,7 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "MatMul") { - CV_Assert(layer.input_size() == 2); + CV_CheckEQ(num_inputs, 2, ""); // For the object detection networks, TensorFlow Object Detection API // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax) @@ -1077,7 +1231,7 @@ void TFImporter::populateNet(Net dstNet) layerParams.set("bias_term", false); layerParams.blobs.resize(1); - StrIntVector next_layers = getNextLayers(net, name, "BiasAdd"); + StrIntVector next_layers = getNextLayers(net, name, "BiasAdd"); // FIXIT Use layers fusion instead if (next_layers.empty()) { next_layers = getNextLayers(net, name, "Add"); @@ -1105,8 +1259,18 @@ void TFImporter::populateNet(Net dstNet) int kernel_blob_index = -1; const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id, -1, &kernel_blob_index); - blobFromTensor(kernelTensor, layerParams.blobs[0]); - releaseTensor(const_cast(&kernelTensor)); + const String kernelTensorName = layer.input(kernel_blob_index); + std::map::iterator sharedWeightsIt 
= sharedWeights.find(kernelTensorName); + if (sharedWeightsIt == sharedWeights.end()) + { + blobFromTensor(kernelTensor, layerParams.blobs[0]); + releaseTensor(const_cast(&kernelTensor)); + sharedWeights[kernelTensorName] = layerParams.blobs[0]; + } + else + { + layerParams.blobs[0] = sharedWeightsIt->second; + } if (kernel_blob_index == 1) { // In this case output is computed by x*W formula - W should be transposed Mat data = layerParams.blobs[0].t(); @@ -1135,44 +1299,57 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "Reshape") { + CV_CheckGT(num_inputs, 0, ""); Pin inpId = parsePin(layer.input(0)); - int inpLayout = getDataLayout(layer.input(0), data_layouts); + DataLayout inpLayout = getDataLayout(layer.input(0), data_layouts); // There are two possible implementations: reshape an input using // predefined sizes or use a second input blob as a source of new shape. if (value_id.find(layer.input(1)) != value_id.end()) { Mat newShape = getTensorContent(getConstBlob(layer, value_id, 1)); - if (newShape.total() == 4) + int newShapeSize = newShape.total(); + bool hasSwap = false; + if (newShapeSize == 4 && hasAllOnes(newShape, 0, 2)) { // NHWC->NCHW std::swap(*newShape.ptr(0, 2), *newShape.ptr(0, 3)); std::swap(*newShape.ptr(0, 1), *newShape.ptr(0, 2)); + hasSwap = true; } if (inpLayout == DATA_LAYOUT_NHWC) { - if (newShape.total() != 4 || newShape.at(1) == 1) + if (newShapeSize >= 2 || newShape.at(1) == 1) { - LayerParams permLP; int order[] = {0, 2, 3, 1}; // From OpenCV's NCHW to NHWC. - permLP.set("order", DictValue::arrayInt(order, 4)); - - std::string permName = name + "/nchw"; - CV_Assert(layer_id.find(permName) == layer_id.end()); - int permId = dstNet.addLayer(permName, "Permute", permLP); - layer_id[permName] = permId; - connect(layer_id, dstNet, inpId, permId, 0); - inpId = Pin(permName); - inpLayout = DATA_LAYOUT_NCHW; + addPermuteLayer(order, name + "/nhwc", inpId); + if (newShapeSize < 4) + { + inpLayout = DATA_LAYOUT_NCHW; + } + else + { + inpLayout = DATA_LAYOUT_NHWC; + } } } - layerParams.set("dim", DictValue::arrayInt(newShape.ptr(), newShape.total())); + layerParams.set("dim", DictValue::arrayInt(newShape.ptr(), newShapeSize)); int id = dstNet.addLayer(name, "Reshape", layerParams); layer_id[name] = id; // one input only connect(layer_id, dstNet, inpId, id, 0); - data_layouts[name] = newShape.total() == 2 ? DATA_LAYOUT_PLANAR : inpLayout; + inpId = Pin(name); + + if ((inpLayout == DATA_LAYOUT_NHWC || inpLayout == DATA_LAYOUT_UNKNOWN || inpLayout == DATA_LAYOUT_PLANAR) && + newShapeSize == 4 && !hasSwap) + { + int order[] = {0, 3, 1, 2}; // Transform back to OpenCV's NCHW. + addPermuteLayer(order, name + "/nchw", inpId); + inpLayout = DATA_LAYOUT_NCHW; + } + + data_layouts[name] = newShapeSize == 2 ? 
DATA_LAYOUT_PLANAR : inpLayout; } else { @@ -1185,6 +1362,7 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "Flatten" || type == "Squeeze") { + CV_CheckGT(num_inputs, 0, ""); Pin inpId = parsePin(layer.input(0)); int inpLayout = getDataLayout(layer.input(0), data_layouts); if (type == "Squeeze") @@ -1231,6 +1409,7 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "Transpose") { + CV_CheckGT(num_inputs, 0, ""); Mat perm = getTensorContent(getConstBlob(layer, value_id, 1)); CV_Assert(perm.type() == CV_32SC1); int* permData = (int*)perm.data; @@ -1304,6 +1483,7 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "LRN") { + CV_CheckGT(num_inputs, 0, ""); if(hasLayerAttr(layer, "alpha")) { layerParams.set("alpha", getLayerAttr(layer, "alpha").f()); } @@ -1322,11 +1502,12 @@ void TFImporter::populateNet(Net dstNet) int id = dstNet.addLayer(name, "LRN", layerParams); layer_id[name] = id; - connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size()); + connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs); } else if (type == "Concat" || type == "ConcatV2") { - int axisId = (type == "Concat" ? 0 : layer.input_size() - 1); + CV_CheckGT(num_inputs, 0, ""); + int axisId = (type == "Concat" ? 0 : num_inputs - 1); int axis = getConstBlob(layer, value_id, axisId).int_val().Get(0); if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC) @@ -1337,7 +1518,7 @@ void TFImporter::populateNet(Net dstNet) // input(0) or input(n-1) is concat_dim int from = (type == "Concat" ? 1 : 0); - int to = (type == "Concat" ? layer.input_size() : layer.input_size() - 1); + int to = (type == "Concat" ? num_inputs : num_inputs - 1); for (int ii = from; ii < to; ii++) { @@ -1370,6 +1551,7 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "MaxPool" || type == "MaxPool3D") { + CV_CheckGT(num_inputs, 0, ""); layerParams.set("pool", "max"); setKSize(layerParams, layer); @@ -1381,10 +1563,11 @@ void TFImporter::populateNet(Net dstNet) int id = dstNet.addLayer(name, "Pooling", layerParams); layer_id[name] = id; - connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size()); + connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs); } else if (type == "AvgPool" || type == "AvgPool3D") { + CV_CheckGT(num_inputs, 0, ""); layerParams.set("pool", "ave"); layerParams.set("ave_pool_padded_area", false); setKSize(layerParams, layer); @@ -1394,11 +1577,11 @@ void TFImporter::populateNet(Net dstNet) int id = dstNet.addLayer(name, "Pooling", layerParams); layer_id[name] = id; - connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size()); + connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs); } else if (type == "MaxPoolGrad") { - CV_Assert(layer.input_size() == 3); + CV_CheckEQ(num_inputs, 3, ""); layerParams.set("pool_k_h", 0); layerParams.set("pool_k_w", 0); @@ -1457,7 +1640,7 @@ void TFImporter::populateNet(Net dstNet) // TODO: slicing input may be Const op // TODO: slicing kernels for convolutions - in current implementation it is impossible // TODO: add parsing num of slices parameter - CV_Assert(layer.input_size() == 2); + CV_CheckEQ(num_inputs, 2, ""); // num_split // 1st blob is dims tensor int axis = getConstBlob(layer, value_id, 0).int_val().Get(0); @@ -1480,7 +1663,7 @@ void TFImporter::populateNet(Net dstNet) // input: "input_node" // input: "Slice/begin" // input: "Slice/size" - CV_Assert(layer.input_size() == 3); + 
CV_CheckEQ(num_inputs, 3, ""); Mat begins = getTensorContent(getConstBlob(layer, value_id, 1)); Mat sizes = getTensorContent(getConstBlob(layer, value_id, 2)); CV_Assert_N(!begins.empty(), !sizes.empty()); @@ -1505,7 +1688,7 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "StridedSlice") { - CV_Assert(layer.input_size() == 4); + CV_CheckEQ(num_inputs, 4, ""); Mat begins = getTensorContent(getConstBlob(layer, value_id, 1)); Mat ends = getTensorContent(getConstBlob(layer, value_id, 2)); Mat strides = getTensorContent(getConstBlob(layer, value_id, 3)); @@ -1544,8 +1727,9 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "Mul" || type == "RealDiv") { + CV_CheckGT(num_inputs, 0, ""); int constId = -1; - for(int ii = 0; ii < layer.input_size(); ++ii) + for(int ii = 0; ii < num_inputs; ++ii) { Pin input = parsePin(layer.input(ii)); if (value_id.find(input.name) != value_id.end()) @@ -1554,12 +1738,12 @@ void TFImporter::populateNet(Net dstNet) break; } } - CV_Assert((constId != -1) || (layer.input_size() == 2)); + CV_Assert((constId != -1) || (num_inputs == 2)); if (constId != -1) { // Multiplication by constant. - CV_Assert(layer.input_size() == 2); + CV_CheckEQ(num_inputs, 2, ""); Mat scaleMat = getTensorContent(getConstBlob(layer, value_id)); CV_Assert(scaleMat.type() == CV_32FC1); if (type == "RealDiv") @@ -1642,8 +1826,9 @@ void TFImporter::populateNet(Net dstNet) { // Check if all the inputs have the same shape. bool equalInpShapes = true; + bool isShapeOnes = false; MatShape outShape0; - for (int ii = 0; ii < layer.input_size() && !netInputShapes.empty(); ii++) + for (int ii = 0; ii < num_inputs && !netInputShapes.empty(); ii++) { Pin pin = parsePin(layer.input(ii)); int inpId = layer_id.find(pin.name)->second; @@ -1662,12 +1847,14 @@ void TFImporter::populateNet(Net dstNet) else if (outShape != outShape0) { equalInpShapes = false; + isShapeOnes = isAllOnes(outShape, 2, outShape.size()) || + isAllOnes(outShape0, 2, outShape0.size()); break; } } int id; - if (equalInpShapes || netInputShapes.empty()) + if (equalInpShapes || netInputShapes.empty() || (!equalInpShapes && isShapeOnes)) { layerParams.set("operation", type == "RealDiv" ? 
"div" : "prod"); id = dstNet.addLayer(name, "Eltwise", layerParams); @@ -1681,7 +1868,7 @@ void TFImporter::populateNet(Net dstNet) layer_id[name] = id; - for (int ii = 0; ii < layer.input_size(); ii++) + for (int ii = 0; ii < num_inputs; ii++) { Pin inp = parsePin(layer.input(ii)); if (layer_id.find(inp.name) == layer_id.end()) @@ -1698,9 +1885,7 @@ void TFImporter::populateNet(Net dstNet) // input: "BatchNorm/beta" // input: "BatchNorm/moving_mean" // input: "BatchNorm/moving_variance" - if (layer.input_size() != 5) - CV_Error(Error::StsNotImplemented, - "Expected gamma, beta, mean and std"); + CV_CheckEQ(num_inputs, 5, "Expected gamma, beta, mean and std"); Pin inpId = parsePin(layer.input(0)); bool isTraining = hasLayerAttr(layer, "is_training") && getLayerAttr(layer, "is_training").b(); @@ -1768,9 +1953,7 @@ void TFImporter::populateNet(Net dstNet) // input: "conv2d_transpose/output_shape" // input: "weights" // input: "input" - if (layer.input_size() != 3) - CV_Error(Error::StsNotImplemented, - "Expected output shape, weights and input nodes"); + CV_CheckEQ(num_inputs, 3, "Expected output shape, weights and input nodes"); layerParams.set("bias_term", false); layerParams.blobs.resize(1); @@ -1845,8 +2028,7 @@ void TFImporter::populateNet(Net dstNet) // input: "lstm_block_wrapper/w_f_diag" // input: "lstm_block_wrapper/w_o_diag" // input: "lstm_block_wrapper/bias" - if (layer.input_size() != 9) - CV_Error(Error::StsNotImplemented, "Unexpected number of input nodes"); + CV_CheckEQ(num_inputs, 9, "Unexpected number of input nodes"); if (hasLayerAttr(layer, "forget_bias")) layerParams.set("forget_bias", getLayerAttr(layer, "forget_bias").f()); @@ -1912,6 +2094,7 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "ResizeNearestNeighbor" || type == "ResizeBilinear" || type == "FusedResizeAndPadConv2D") { + CV_CheckGT(num_inputs, 0, ""); std::string convWeights = ""; if (type == "FusedResizeAndPadConv2D") { @@ -1919,30 +2102,32 @@ void TFImporter::populateNet(Net dstNet) // input: "decoder/ResizeBilinear/size" // input: "decoder/decoder_conv0/Conv2D_dummy_paddings" // input: "decoder/decoder_conv0/weights" - CV_CheckEQ(layer.input_size(), 4, "Number of input for FusedResizeAndPadConv2D"); + CV_CheckEQ(num_inputs, 4, "Number of input for FusedResizeAndPadConv2D"); Mat paddings = getTensorContent(getConstBlob(layer, value_id, 2)); CV_CheckEQ(countNonZero(paddings), 0, "Unsupported mode"); convWeights = layer.input(3); - layer.mutable_input()->DeleteSubrange(2, 2); + layer.mutable_input()->DeleteSubrange(2, 2); // FIXIT do NOT modify input model + num_inputs = layer.input_size(); name = name + "/resize"; if (hasLayerAttr(layer, "resize_align_corners")) { + // FIXIT do NOT modify input model layer.mutable_attr()->insert( ::google::protobuf::MapPair("align_corners", getLayerAttr(layer, "resize_align_corners"))); } } - if (layer.input_size() == 2) + if (num_inputs == 2) { Mat outSize = getTensorContent(getConstBlob(layer, value_id, 1)); CV_CheckTypeEQ(outSize.type(), CV_32SC1, ""); CV_CheckEQ(outSize.total(), (size_t)2, ""); layerParams.set("height", outSize.at(0, 0)); layerParams.set("width", outSize.at(0, 1)); } - else if (layer.input_size() == 3) + else if (num_inputs == 3) { Mat factorHeight = getTensorContent(getConstBlob(layer, value_id, 1)); Mat factorWidth = getTensorContent(getConstBlob(layer, value_id, 2)); @@ -1952,7 +2137,7 @@ void TFImporter::populateNet(Net dstNet) layerParams.set("zoom_factor_y", factorHeight.at(0)); } else - CV_Assert(layer.input_size() == 2 || 
layer.input_size() == 3); + CV_Check(num_inputs, num_inputs == 2 || num_inputs == 3, ""); if (type == "ResizeNearestNeighbor") layerParams.set("interpolation", "nearest"); @@ -1962,6 +2147,9 @@ void TFImporter::populateNet(Net dstNet) if (hasLayerAttr(layer, "align_corners")) layerParams.set("align_corners", getLayerAttr(layer, "align_corners").b()); + if (hasLayerAttr(layer, "half_pixel_centers")) + layerParams.set("half_pixel_centers", getLayerAttr(layer, "half_pixel_centers").b()); + int id = dstNet.addLayer(name, "Resize", layerParams); layer_id[name] = id; @@ -1970,12 +2158,12 @@ void TFImporter::populateNet(Net dstNet) // Step back to add convolution if (type == "FusedResizeAndPadConv2D") { - tensorflow::NodeDef* conv = net.mutable_node(li); - conv->clear_input(); - conv->add_input(name); - conv->add_input(convWeights); - conv->set_op("Conv2D"); - li -= 1; + tensorflow::NodeDef conv = layer_; + conv.clear_input(); + conv.add_input(name); + conv.add_input(convWeights); + conv.set_op("Conv2D"); + parseNode(conv); } } else if (type == "L2Normalize") @@ -1983,7 +2171,7 @@ void TFImporter::populateNet(Net dstNet) // op: "L2Normalize" // input: "input" // input: "reduction_indices" (axis) - CV_Assert(layer.input_size() == 2); + CV_CheckEQ(num_inputs, 2, ""); Mat reductionIndices = getTensorContent(getConstBlob(layer, value_id, 1)); CV_Assert(reductionIndices.type() == CV_32SC1); @@ -2008,6 +2196,7 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "PriorBox") { + CV_CheckEQ(num_inputs, 2, ""); if (hasLayerAttr(layer, "min_size")) layerParams.set("min_size", getLayerAttr(layer, "min_size").i()); if (hasLayerAttr(layer, "max_size")) @@ -2040,12 +2229,13 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "Softmax") { + CV_CheckGT(num_inputs, 0, ""); if (hasLayerAttr(layer, "axis")) layerParams.set("axis", getLayerAttr(layer, "axis").i()); int id = dstNet.addLayer(name, "Softmax", layerParams); layer_id[name] = id; - connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size()); + connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs); } else if (type == "CropAndResize") { @@ -2053,7 +2243,7 @@ void TFImporter::populateNet(Net dstNet) // input: "input" // input: "boxes" // input: "sizes" - CV_Assert(layer.input_size() == 3); + CV_CheckEQ(num_inputs, 3, ""); Mat cropSize = getTensorContent(getConstBlob(layer, value_id, 2)); CV_CheckTypeEQ(cropSize.type(), CV_32SC1, ""); CV_CheckEQ(cropSize.total(), (size_t)2, ""); @@ -2081,6 +2271,7 @@ void TFImporter::populateNet(Net dstNet) // determine out shape: NxCxHxW --Slice--> 1xCxHxW // out_shape = 1xCxHxW if keepDims else (1xCxHxW --Flatten--> CxHxW) // global pool: NxCxHxW --Flatten--> Nx(C*H*W) --Reshape--> 1x1xNx(C*H*W) --Pooling--> 1x1x1x(C*H*W) --Reshape--> out_shape + CV_CheckGT(num_inputs, 0, ""); Mat indices = getTensorContent(getConstBlob(layer, value_id, 1)); CV_Assert(indices.type() == CV_32SC1); @@ -2169,12 +2360,9 @@ void TFImporter::populateNet(Net dstNet) // To keep correct order after squeeze dims we first need to change layout from NCHW to NHWC LayerParams permLP; int order[] = {0, 2, 3, 1}; // From OpenCV's NCHW to NHWC. 
- permLP.set("order", DictValue::arrayInt(order, 4)); std::string permName = name + "/nchw"; - CV_Assert(layer_id.find(permName) == layer_id.end()); - int permId = dstNet.addLayer(permName, "Permute", permLP); - layer_id[permName] = permId; - connect(layer_id, dstNet, Pin(name), permId, 0); + Pin inpId = Pin(name); + addPermuteLayer(order, permName, inpId); LayerParams squeezeLp; std::string squeezeName = name + "/squeeze"; @@ -2186,6 +2374,38 @@ void TFImporter::populateNet(Net dstNet) connect(layer_id, dstNet, Pin(permName), squeezeId, 0); } } + else if (axis == 1) + { + int order[] = {0, 2, 3, 1}; // From OpenCV's NCHW to NHWC. + Pin inpId = parsePin(layer.input(0)); + addPermuteLayer(order, name + "/nhwc", inpId); + + layerParams.set("pool", type == "Mean" ? "ave" : "sum"); + layerParams.set("kernel_h", 1); + layerParams.set("global_pooling_w", true); + int id = dstNet.addLayer(name, "Pooling", layerParams); + layer_id[name] = id; + connect(layer_id, dstNet, inpId, id, 0); + + if (!keepDims) + { + LayerParams squeezeLp; + std::string squeezeName = name + "/squeeze"; + CV_Assert(layer_id.find(squeezeName) == layer_id.end()); + int channel_id = 3; // TF NHWC layout + squeezeLp.set("axis", channel_id - 1); + squeezeLp.set("end_axis", channel_id); + int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp); + layer_id[squeezeName] = squeezeId; + connect(layer_id, dstNet, Pin(name), squeezeId, 0); + } + else + { + int order[] = {0, 3, 1, 2}; // From NHWC to OpenCV's NCHW. + Pin inpId = parsePin(name); + addPermuteLayer(order, name + "/nchw", inpId); + } + } } else { if (indices.total() != 2 || indices.at(0) != 1 || indices.at(1) != 2) CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean or reduce_sum operation."); @@ -2215,6 +2435,7 @@ void TFImporter::populateNet(Net dstNet) // Example: given a list with "N" tensors of shape (C, H, W): // if axis == 0 then the output tensor will have the shape (N, C, H, W), // if axis == 1 then the output tensor will have the shape (C, N, H, W). 
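
A worked shape example for the Pack semantics described in the comment above: stacking N equal-shape tensors inserts a new dimension of size N at position axis, which the importer appears to realize for the axis == 0 case via per-input reshapes followed by a concatenation. A tiny sketch with a hypothetical helper, not importer code:

    #include <cassert>
    #include <vector>

    // Output shape of Pack over n tensors of identical shape: a new
    // dimension of size n appears at position `axis`.
    std::vector<int> packedShape(const std::vector<int>& inputShape, int n, int axis)
    {
        assert(axis >= 0 && axis <= (int)inputShape.size());
        std::vector<int> out(inputShape);
        out.insert(out.begin() + axis, n);
        return out;
    }

    // packedShape({3, 224, 224}, 4, 0) -> {4, 3, 224, 224}
    // packedShape({3, 224, 224}, 4, 1) -> {3, 4, 224, 224}
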
+ CV_CheckGT(num_inputs, 0, ""); CV_Assert(hasLayerAttr(layer, "axis")); int dim = (int)getLayerAttr(layer, "axis").i(); if (dim != 0) @@ -2222,7 +2443,7 @@ void TFImporter::populateNet(Net dstNet) CV_Assert(hasLayerAttr(layer, "N")); int num = (int)getLayerAttr(layer, "N").i(); - CV_Assert(layer.input_size() == num); + CV_CheckEQ(num_inputs, num, ""); std::string base_name = name + "/reshape_"; std::vector reshape_ids; for (int i = 0; i < num; i++) { @@ -2253,7 +2474,7 @@ void TFImporter::populateNet(Net dstNet) // input: "input" // input: "mix" // input: "max" - CV_Assert(layer.input_size() == 3); + CV_CheckEQ(num_inputs, 3, ""); Mat minValue = getTensorContent(getConstBlob(layer, value_id, 1)); Mat maxValue = getTensorContent(getConstBlob(layer, value_id, 2)); @@ -2268,10 +2489,21 @@ void TFImporter::populateNet(Net dstNet) connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); } + else if (type == "LeakyRelu") + { + CV_CheckGT(num_inputs, 0, ""); + CV_Assert(hasLayerAttr(layer, "alpha")); + layerParams.set("negative_slope", getLayerAttr(layer, "alpha").f()); + + int id = dstNet.addLayer(name, "ReLU", layerParams); + layer_id[name] = id; + connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs); + } else if (type == "Abs" || type == "Tanh" || type == "Sigmoid" || - type == "Relu" || type == "Elu" || + type == "Relu" || type == "Elu" || type == "Exp" || type == "Identity" || type == "Relu6") { + CV_CheckGT(num_inputs, 0, ""); std::string dnnType = type; if (type == "Abs") dnnType = "AbsVal"; else if (type == "Tanh") dnnType = "TanH"; @@ -2281,7 +2513,7 @@ void TFImporter::populateNet(Net dstNet) int id = dstNet.addLayer(name, dnnType, layerParams); layer_id[name] = id; - connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size()); + connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs); } else { @@ -2305,7 +2537,7 @@ void TFImporter::populateNet(Net dstNet) // All the Const input nodes are added to layer's blobs. std::vector inputsNames; - for (int i = 0; i < layer.input_size(); ++i) + for (int i = 0; i < num_inputs; ++i) { // Check if input is a Const node. if (value_id.find(layer.input(i)) != value_id.end()) @@ -2325,7 +2557,11 @@ void TFImporter::populateNet(Net dstNet) } } } - dstNet.setInputsNames(netInputsNames); + catch (const std::exception& e) + { + CV_LOG_ERROR(NULL, "DNN/TF: Can't parse layer for node='" << name << "'. 
Exception: " << e.what()); + throw; + } } } // namespace @@ -2334,18 +2570,16 @@ void TFImporter::populateNet(Net dstNet) Net readNetFromTensorflow(const String &model, const String &config) { - TFImporter importer(model.c_str(), config.c_str()); Net net; - importer.populateNet(net); + TFImporter importer(net, model.c_str(), config.c_str()); return net; } Net readNetFromTensorflow(const char* bufferModel, size_t lenModel, const char* bufferConfig, size_t lenConfig) { - TFImporter importer(bufferModel, lenModel, bufferConfig, lenConfig); Net net; - importer.populateNet(net); + TFImporter importer(net, bufferModel, lenModel, bufferConfig, lenConfig); return net; } diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 67f5782a2e..aab4c6f507 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -101,6 +101,9 @@ public: TEST_P(DNNTestNetwork, AlexNet) { applyTestTag(CV_TEST_TAG_MEMORY_1GB); + if (backend == DNN_BACKEND_HALIDE) // Realization contains wrong number of Images (1) for realizing pipeline with 2 outputs + applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); + processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", Size(227, 227), "prob", target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_alexnet.yml" : @@ -115,6 +118,9 @@ TEST_P(DNNTestNetwork, ResNet_50) (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB), CV_TEST_TAG_DEBUG_LONG ); + if (backend == DNN_BACKEND_HALIDE) // Realization contains wrong number of Images (1) for realizing pipeline with 2 outputs + applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); + processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", Size(224, 224), "prob", target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_resnet_50.yml" : @@ -125,6 +131,9 @@ TEST_P(DNNTestNetwork, ResNet_50) TEST_P(DNNTestNetwork, SqueezeNet_v1_1) { + if (backend == DNN_BACKEND_HALIDE) // Realization contains wrong number of Images (1) for realizing pipeline with 2 outputs + applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); + processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt", Size(227, 227), "prob", target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_squeezenet_v1_1.yml" : @@ -136,6 +145,9 @@ TEST_P(DNNTestNetwork, SqueezeNet_v1_1) TEST_P(DNNTestNetwork, GoogLeNet) { applyTestTag(target == DNN_TARGET_CPU ? "" : CV_TEST_TAG_MEMORY_512MB); + if (backend == DNN_BACKEND_HALIDE) // Realization contains wrong number of Images (1) for realizing pipeline with 2 outputs + applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); + processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", Size(224, 224), "prob"); expectNoFallbacksFromIE(net); @@ -145,6 +157,9 @@ TEST_P(DNNTestNetwork, GoogLeNet) TEST_P(DNNTestNetwork, Inception_5h) { applyTestTag(CV_TEST_TAG_MEMORY_512MB); + if (backend == DNN_BACKEND_HALIDE) // Realization contains wrong number of Images (1) for realizing pipeline with 2 outputs + applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); + double l1 = default_l1, lInf = default_lInf; if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_CPU || target == DNN_TARGET_OPENCL)) { @@ -162,6 +177,9 @@ TEST_P(DNNTestNetwork, Inception_5h) TEST_P(DNNTestNetwork, ENet) { applyTestTag(target == DNN_TARGET_CPU ? 
"" : CV_TEST_TAG_MEMORY_512MB); + if (backend == DNN_BACKEND_HALIDE) // Realization contains wrong number of Images (1) for realizing pipeline with 2 outputs + applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) diff --git a/modules/dnn/test/test_common.hpp b/modules/dnn/test/test_common.hpp index 3bc8fc3a89..139f3d1671 100644 --- a/modules/dnn/test/test_common.hpp +++ b/modules/dnn/test/test_common.hpp @@ -30,11 +30,13 @@ #define CV_TEST_TAG_DNN_SKIP_IE_2019R1_1 "dnn_skip_ie_2019r1_1" #define CV_TEST_TAG_DNN_SKIP_IE_2019R2 "dnn_skip_ie_2019r2" #define CV_TEST_TAG_DNN_SKIP_IE_2019R3 "dnn_skip_ie_2019r3" +#define CV_TEST_TAG_DNN_SKIP_IE_CPU "dnn_skip_ie_cpu" #define CV_TEST_TAG_DNN_SKIP_IE_OPENCL "dnn_skip_ie_ocl" #define CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16 "dnn_skip_ie_ocl_fp16" #define CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_2 "dnn_skip_ie_myriad2" #define CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X "dnn_skip_ie_myriadx" #define CV_TEST_TAG_DNN_SKIP_IE_MYRIAD CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_2, CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X +#define CV_TEST_TAG_DNN_SKIP_IE_ARM_CPU "dnn_skip_ie_arm_cpu" #define CV_TEST_TAG_DNN_SKIP_VULKAN "dnn_skip_vulkan" @@ -113,6 +115,14 @@ void normAssertDetections( double confThreshold = 0.0, double scores_diff = 1e-5, double boxes_iou_diff = 1e-4); +// For text detection networks +// Curved text polygon is not supported in the current version. +// (concave polygon is invalid input to intersectConvexConvex) +void normAssertTextDetections( + const std::vector>& gtPolys, + const std::vector>& testPolys, + const char *comment = "", double boxes_iou_diff = 1e-4); + void readFileContent(const std::string& filename, CV_OUT std::vector& content); #ifdef HAVE_INF_ENGINE diff --git a/modules/dnn/test/test_common.impl.hpp b/modules/dnn/test/test_common.impl.hpp index cf1b558391..3d56e6f308 100644 --- a/modules/dnn/test/test_common.impl.hpp +++ b/modules/dnn/test/test_common.impl.hpp @@ -177,6 +177,52 @@ void normAssertDetections( testBoxes, comment, confThreshold, scores_diff, boxes_iou_diff); } +// For text detection networks +// Curved text polygon is not supported in the current version. +// (concave polygon is invalid input to intersectConvexConvex) +void normAssertTextDetections( + const std::vector>& gtPolys, + const std::vector>& testPolys, + const char *comment /*= ""*/, double boxes_iou_diff /*= 1e-4*/) +{ + std::vector matchedRefBoxes(gtPolys.size(), false); + for (uint i = 0; i < testPolys.size(); ++i) + { + const std::vector& testPoly = testPolys[i]; + bool matched = false; + double topIoU = 0; + for (uint j = 0; j < gtPolys.size() && !matched; ++j) + { + if (!matchedRefBoxes[j]) + { + std::vector intersectionPolygon; + float intersectArea = intersectConvexConvex(testPoly, gtPolys[j], intersectionPolygon, true); + double iou = intersectArea / (contourArea(testPoly) + contourArea(gtPolys[j]) - intersectArea); + topIoU = std::max(topIoU, iou); + if (1.0 - iou < boxes_iou_diff) + { + matched = true; + matchedRefBoxes[j] = true; + } + } + } + if (!matched) { + std::cout << cv::format("Unmatched-det:") << testPoly << std::endl; + std::cout << "Highest IoU: " << topIoU << std::endl; + } + EXPECT_TRUE(matched) << comment; + } + + // Check unmatched groundtruth. 
+ for (uint i = 0; i < gtPolys.size(); ++i) + { + if (!matchedRefBoxes[i]) { + std::cout << cv::format("Unmatched-gt:") << gtPolys[i] << std::endl; + } + EXPECT_TRUE(matchedRefBoxes[i]); + } +} + void readFileContent(const std::string& filename, CV_OUT std::vector& content) { const std::ios::openmode mode = std::ios::in | std::ios::binary; @@ -407,13 +453,13 @@ void initDNNTests() #ifdef HAVE_DNN_IE_NN_BUILDER_2019 CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, #endif - "" + CV_TEST_TAG_DNN_SKIP_IE_CPU ); -#endif registerGlobalSkipTag( // see validateVPUType(): CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_2, CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16 ); +#endif #ifdef HAVE_VULKAN registerGlobalSkipTag( CV_TEST_TAG_DNN_SKIP_VULKAN diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index 021603636e..f2b30c9b87 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -656,7 +656,7 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny) target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB ); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021010000) // nGraph compilation failure +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) // nGraph compilation failure if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION); #endif @@ -727,6 +727,10 @@ TEST_P(Test_Darknet_layers, shortcut) TEST_P(Test_Darknet_layers, upsample) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif testDarknetLayer("upsample"); } @@ -735,6 +739,11 @@ TEST_P(Test_Darknet_layers, mish) testDarknetLayer("mish", true); } +TEST_P(Test_Darknet_layers, tanh) +{ + testDarknetLayer("tanh"); +} + TEST_P(Test_Darknet_layers, avgpool_softmax) { testDarknetLayer("avgpool_softmax"); @@ -798,6 +807,11 @@ TEST_P(Test_Darknet_layers, relu) testDarknetLayer("relu"); } +TEST_P(Test_Darknet_layers, sam) +{ + testDarknetLayer("sam", true); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_layers, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp index 7e6d7f87d2..165ee4d67b 100644 --- a/modules/dnn/test/test_halide_layers.cpp +++ b/modules/dnn/test/test_halide_layers.cpp @@ -258,7 +258,17 @@ TEST_P(LRN, Accuracy) int sz[] = {1, inChannels, inSize.height, inSize.width}; Mat input(4, &sz[0], CV_32F); - test(lp, input, backendId, targetId); + + double l1 = 0.0, lInf = 0.0; + // The OpenCL kernels use the native_ math functions which have + // implementation defined accuracy, so we use relaxed thresholds. See + // https://github.com/opencv/opencv/issues/9821 for more details. 
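
The relaxed thresholds mentioned in the comment above bound two error norms: l1 is the mean absolute difference over all elements, lInf the largest single-element difference. A rough sketch of the kind of check these tests perform (the actual normAssert helper carries extra reporting and masking logic):

    #include <opencv2/core.hpp>

    // Accept `out` as matching `ref` when both error norms stay inside
    // the per-target tolerances.
    static bool withinTolerance(const cv::Mat& ref, const cv::Mat& out,
                                double l1, double lInf)
    {
        double errL1  = cv::norm(ref, out, cv::NORM_L1) / ref.total(); // mean abs diff
        double errInf = cv::norm(ref, out, cv::NORM_INF);              // max abs diff
        return errL1 <= l1 && errInf <= lInf;
    }
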
+ if (targetId == DNN_TARGET_OPENCL) + { + l1 = 0.01; + lInf = 0.01; + } + test(lp, input, backendId, targetId, false, l1, lInf); } INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, LRN, Combine( @@ -632,6 +642,31 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Power, Combine( dnnBackendsAndTargetsWithHalide() )); +typedef TestWithParam > > Exp; +TEST_P(Exp, Accuracy) +{ + float base = get<0>(GetParam())[0]; + float scale = get<0>(GetParam())[1]; + float shift = get<0>(GetParam())[2]; + Backend backendId = get<0>(get<1>(GetParam())); + Target targetId = get<1>(get<1>(GetParam())); + + LayerParams lp; + lp.set("base", base); + lp.set("scale", scale); + lp.set("shift", shift); + lp.type = "Exp"; + lp.name = "testLayer"; + testInPlaceActivation(lp, backendId, targetId); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Exp, Combine( +/*base, scale, shift*/ Values(Vec3f(0.9f, -1.0f, 1.1f), Vec3f(0.9f, 1.1f, -1.0f), + Vec3f(-1.0f, 0.9f, 1.1f), Vec3f(-1.0f, 1.1f, 0.9f), + Vec3f(1.1f, 0.9f, -1.0f), Vec3f(1.1f, -1.0f, 0.9f)), + dnnBackendsAndTargetsWithHalide() +)); + TEST_P(Test_Halide_layers, ChannelsPReLU) { LayerParams lp; diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 61537e0e01..20d3fb41eb 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -169,8 +169,17 @@ TEST_P(Test_Caffe_layers, Softmax) TEST_P(Test_Caffe_layers, LRN) { - testLayerUsingCaffeModels("layer_lrn_spatial"); - testLayerUsingCaffeModels("layer_lrn_channels"); + double l1 = 0.0, lInf = 0.0; + // The OpenCL kernels use the native_ math functions which have + // implementation defined accuracy, so we use relaxed thresholds. See + // https://github.com/opencv/opencv/issues/9821 for more details. + if (target == DNN_TARGET_OPENCL) + { + l1 = 0.01; + lInf = 0.01; + } + testLayerUsingCaffeModels("layer_lrn_spatial", false, true, l1, lInf); + testLayerUsingCaffeModels("layer_lrn_channels", false, true, l1, lInf); } TEST_P(Test_Caffe_layers, Convolution) @@ -1583,6 +1592,11 @@ TEST_P(Test_Caffe_layers, Interp) TEST_P(Test_Caffe_layers, DISABLED_Interp) // requires patched protobuf (available in OpenCV source tree only) #endif { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); @@ -2152,6 +2166,12 @@ public: randu(scales, -1.0f, 1.0f); activationParams.blobs.push_back(scales); } + else if (activationParams.type == "Exp") + { + activationParams.set("base", -1.0f); + activationParams.set("scale", 0.3f); + activationParams.set("shift", 0.6f); + } } static void makeDefaultTestEltwiseLayer(LayerParams& eltwiseParams, const std::string& op, bool withCoefficients) @@ -2223,7 +2243,7 @@ public: static testing::internal::ParamGenerator activationLayersList() { // TODO: automate list generation - return Values("ReLU", "ReLU6", "ChannelsPReLU", "TanH", "Swish", "Mish", "Sigmoid", "ELU", "AbsVal", "BNLL", "Power"); + return Values("ReLU", "ReLU6", "ChannelsPReLU", "TanH", "Swish", "Mish", "Sigmoid", "ELU", "AbsVal", "BNLL", "Power", "Exp"); } static testing::internal::ParamGenerator > dnnBackendsAndTargetsForFusionTests() diff --git a/modules/dnn/test/test_model.cpp b/modules/dnn/test/test_model.cpp index 7d516de73e..f7befa9937 100644 --- 
a/modules/dnn/test/test_model.cpp +++ b/modules/dnn/test/test_model.cpp @@ -25,7 +25,8 @@ public: double scoreDiff, double iouDiff, double confThreshold = 0.24, double nmsThreshold = 0.0, const Size& size = {-1, -1}, Scalar mean = Scalar(), - double scale = 1.0, bool swapRB = false, bool crop = false) + double scale = 1.0, bool swapRB = false, bool crop = false, + bool nmsAcrossClasses = false) { checkBackend(); @@ -38,6 +39,8 @@ public: model.setPreferableBackend(backend); model.setPreferableTarget(target); + model.setNmsAcrossClasses(nmsAcrossClasses); + std::vector classIds; std::vector confidences; std::vector boxes; @@ -110,6 +113,156 @@ public: model.segment(frame, mask); normAssert(mask, exp, "", norm, norm); } + + void testTextRecognitionModel(const std::string& weights, const std::string& cfg, + const std::string& imgPath, const std::string& seq, + const std::string& decodeType, const std::vector& vocabulary, + const Size& size = {-1, -1}, Scalar mean = Scalar(), + double scale = 1.0, bool swapRB = false, bool crop = false) + { + checkBackend(); + + Mat frame = imread(imgPath, IMREAD_GRAYSCALE); + + TextRecognitionModel model(weights, cfg); + model.setDecodeType(decodeType) + .setVocabulary(vocabulary) + .setInputSize(size).setInputMean(mean).setInputScale(scale) + .setInputSwapRB(swapRB).setInputCrop(crop); + + model.setPreferableBackend(backend); + model.setPreferableTarget(target); + + std::string result = model.recognize(frame); + EXPECT_EQ(result, seq) << "Full frame: " << imgPath; + + std::vector rois; + rois.push_back(Rect(0, 0, frame.cols, frame.rows)); + rois.push_back(Rect(0, 0, frame.cols, frame.rows)); // twice + std::vector results; + model.recognize(frame, rois, results); + EXPECT_EQ((size_t)2u, results.size()) << "ROI: " << imgPath; + EXPECT_EQ(results[0], seq) << "ROI[0]: " << imgPath; + EXPECT_EQ(results[1], seq) << "ROI[1]: " << imgPath; + } + + void testTextDetectionModelByDB(const std::string& weights, const std::string& cfg, + const std::string& imgPath, const std::vector>& gt, + float binThresh, float polyThresh, + uint maxCandidates, double unclipRatio, + const Size& size = {-1, -1}, Scalar mean = Scalar(), + double scale = 1.0, bool swapRB = false, bool crop = false) + { + checkBackend(); + + Mat frame = imread(imgPath); + + TextDetectionModel_DB model(weights, cfg); + model.setBinaryThreshold(binThresh) + .setPolygonThreshold(polyThresh) + .setUnclipRatio(unclipRatio) + .setMaxCandidates(maxCandidates) + .setInputSize(size).setInputMean(mean).setInputScale(scale) + .setInputSwapRB(swapRB).setInputCrop(crop); + + model.setPreferableBackend(backend); + model.setPreferableTarget(target); + + // 1. Check common TextDetectionModel API through RotatedRect + std::vector results; + model.detectTextRectangles(frame, results); + + EXPECT_GT(results.size(), (size_t)0); + + std::vector< std::vector > contours; + for (size_t i = 0; i < results.size(); i++) + { + const RotatedRect& box = results[i]; + Mat contour; + boxPoints(box, contour); + std::vector contour2i(4); + for (int i = 0; i < 4; i++) + { + contour2i[i].x = cvRound(contour.at(i, 0)); + contour2i[i].y = cvRound(contour.at(i, 1)); + } + contours.push_back(contour2i); + } +#if 0 // test debug + Mat result = frame.clone(); + drawContours(result, contours, -1, Scalar(0, 0, 255), 1); + imshow("result", result); // imwrite("result.png", result); + waitKey(0); +#endif + normAssertTextDetections(gt, contours, "", 0.05f); + + // 2. 
Check quadrangle-based API + // std::vector< std::vector > contours; + model.detect(frame, contours); + +#if 0 // test debug + Mat result = frame.clone(); + drawContours(result, contours, -1, Scalar(0, 0, 255), 1); + imshow("result_contours", result); // imwrite("result_contours.png", result); + waitKey(0); +#endif + normAssertTextDetections(gt, contours, "", 0.05f); + } + + void testTextDetectionModelByEAST( + const std::string& weights, const std::string& cfg, + const std::string& imgPath, const std::vector& gt, + float confThresh, float nmsThresh, + const Size& size = {-1, -1}, Scalar mean = Scalar(), + double scale = 1.0, bool swapRB = false, bool crop = false, + double eps_center = 5/*pixels*/, double eps_size = 5/*pixels*/, double eps_angle = 1 + ) + { + checkBackend(); + + Mat frame = imread(imgPath); + + TextDetectionModel_EAST model(weights, cfg); + model.setConfidenceThreshold(confThresh) + .setNMSThreshold(nmsThresh) + .setInputSize(size).setInputMean(mean).setInputScale(scale) + .setInputSwapRB(swapRB).setInputCrop(crop); + + model.setPreferableBackend(backend); + model.setPreferableTarget(target); + + std::vector results; + model.detectTextRectangles(frame, results); + + EXPECT_EQ(results.size(), (size_t)1); + for (size_t i = 0; i < results.size(); i++) + { + const RotatedRect& box = results[i]; +#if 0 // test debug + Mat contour; + boxPoints(box, contour); + std::vector contour2i(4); + for (int i = 0; i < 4; i++) + { + contour2i[i].x = cvRound(contour.at(i, 0)); + contour2i[i].y = cvRound(contour.at(i, 1)); + } + std::vector< std::vector > contours; + contours.push_back(contour2i); + + Mat result = frame.clone(); + drawContours(result, contours, -1, Scalar(0, 0, 255), 1); + imshow("result", result); //imwrite("result.png", result); + waitKey(0); +#endif + const RotatedRect& gtBox = gt[i]; + EXPECT_NEAR(box.center.x, gtBox.center.x, eps_center); + EXPECT_NEAR(box.center.y, gtBox.center.y, eps_center); + EXPECT_NEAR(box.size.width, gtBox.size.width, eps_size); + EXPECT_NEAR(box.size.height, gtBox.size.height, eps_size); + EXPECT_NEAR(box.angle, gtBox.angle, eps_angle); + } + } }; TEST_P(Test_Model, Classify) @@ -177,6 +330,58 @@ TEST_P(Test_Model, DetectRegion) Scalar(), scale, swapRB); } +TEST_P(Test_Model, DetectRegionWithNmsAcrossClasses) +{ + applyTestTag(CV_TEST_TAG_LONG, CV_TEST_TAG_MEMORY_1GB); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) // nGraph compilation failure + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); +#endif + +#if defined(INF_ENGINE_RELEASE) + if (target == DNN_TARGET_MYRIAD + && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); +#endif + + std::vector refClassIds = { 6, 11 }; + std::vector refConfidences = { 0.750469f, 0.901615f }; + std::vector refBoxes = { Rect2d(240, 53, 135, 72), + Rect2d(58, 141, 117, 249) }; + + std::string img_path = _tf("dog416.png"); + std::string weights_file = _tf("yolo-voc.weights", false); + std::string config_file = 
_tf("yolo-voc.cfg"); + + double scale = 1.0 / 255.0; + Size size{ 416, 416 }; + bool swapRB = true; + bool crop = false; + bool nmsAcrossClasses = true; + + double confThreshold = 0.24; + double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.15: 0.15; + double scoreDiff = 8e-5, iouDiff = 1e-5; + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16) + { + scoreDiff = 1e-2; + iouDiff = 1.6e-2; + } + + testDetectModel(weights_file, config_file, img_path, refClassIds, refConfidences, + refBoxes, scoreDiff, iouDiff, confThreshold, nmsThreshold, size, + Scalar(), scale, swapRB, crop, + nmsAcrossClasses); +} + TEST_P(Test_Model, DetectionOutput) { #if defined(INF_ENGINE_RELEASE) @@ -391,6 +596,87 @@ TEST_P(Test_Model, Segmentation) testSegmentationModel(weights_file, config_file, inp, exp, norm, size, mean, scale, swapRB); } +TEST_P(Test_Model, TextRecognition) +{ + if (target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + + std::string imgPath = _tf("text_rec_test.png"); + std::string weightPath = _tf("onnx/models/crnn.onnx", false); + std::string seq = "welcome"; + + Size size{100, 32}; + double scale = 1.0 / 127.5; + Scalar mean = Scalar(127.5); + std::string decodeType = "CTC-greedy"; + std::vector vocabulary = {"0","1","2","3","4","5","6","7","8","9", + "a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"}; + + testTextRecognitionModel(weightPath, "", imgPath, seq, decodeType, vocabulary, size, mean, scale); +} + +TEST_P(Test_Model, TextDetectionByDB) +{ + if (target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + + std::string imgPath = _tf("text_det_test1.png"); + std::string weightPath = _tf("onnx/models/DB_TD500_resnet50.onnx", false); + + // GroundTruth + std::vector> gt = { + { Point(142, 193), Point(136, 164), Point(213, 150), Point(219, 178) }, + { Point(136, 165), Point(122, 114), Point(319, 71), Point(330, 122) } + }; + + Size size{736, 736}; + double scale = 1.0 / 255.0; + Scalar mean = Scalar(122.67891434, 116.66876762, 104.00698793); + + float binThresh = 0.3; + float polyThresh = 0.5; + uint maxCandidates = 200; + double unclipRatio = 2.0; + + testTextDetectionModelByDB(weightPath, "", imgPath, gt, binThresh, polyThresh, maxCandidates, unclipRatio, size, mean, scale); +} + +TEST_P(Test_Model, TextDetectionByEAST) +{ + std::string imgPath = _tf("text_det_test2.jpg"); + std::string weightPath = _tf("frozen_east_text_detection.pb", false); + + // GroundTruth + std::vector gt = { + RotatedRect(Point2f(657.55f, 409.5f), Size2f(316.84f, 62.45f), -4.79) + }; + + // Model parameters + Size size{320, 320}; + double scale = 1.0; + Scalar mean = Scalar(123.68, 116.78, 103.94); + bool swapRB = true; + + // Detection algorithm parameters + float confThresh = 0.5; + float nmsThresh = 0.4; + + double eps_center = 5/*pixels*/; + double eps_size = 5/*pixels*/; + double eps_angle = 1; + + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_MYRIAD) + { + eps_center = 10; + eps_size = 25; + eps_angle = 3; + } + + testTextDetectionModelByEAST(weightPath, "", imgPath, gt, confThresh, nmsThresh, size, mean, scale, swapRB, false/*crop*/, + eps_center, eps_size, eps_angle + ); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_Model, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index a2c097da42..81ea1dcdd0 100644 --- 
a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -122,7 +122,8 @@ TEST_P(Test_ONNX_layers, Convolution_variable_weight) if (backend == DNN_BACKEND_CUDA) applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); // not supported - + if (backend == DNN_BACKEND_VKCOM) + applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN); // not supported String basename = "conv_variable_w"; Net net = readNetFromONNX(_tf("models/" + basename + ".onnx")); ASSERT_FALSE(net.empty()); @@ -152,6 +153,12 @@ TEST_P(Test_ONNX_layers, Convolution_variable_weight_bias) if (backend == DNN_BACKEND_CUDA) applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); // not supported + if (backend == DNN_BACKEND_VKCOM) + applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN); // not supported + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_CPU && + getInferenceEngineCPUType() == CV_DNN_INFERENCE_ENGINE_CPU_TYPE_ARM_COMPUTE) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_ARM_CPU, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); String basename = "conv_variable_wb"; Net net = readNetFromONNX(_tf("models/" + basename + ".onnx")); @@ -326,6 +333,13 @@ TEST_P(Test_ONNX_layers, Power) testONNXModels("pow2", npy, 0, 0, false, false); } +TEST_P(Test_ONNX_layers, Exp) +{ + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + testONNXModels("exp"); +} + TEST_P(Test_ONNX_layers, Concatenation) { if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) @@ -426,14 +440,27 @@ TEST_P(Test_ONNX_layers, BatchNormalization3D) TEST_P(Test_ONNX_layers, BatchNormalizationUnfused) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_CPU) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_CPU, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif testONNXModels("frozenBatchNorm2d"); } TEST_P(Test_ONNX_layers, BatchNormalizationSubgraph) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_CPU) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_CPU, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif testONNXModels("batch_norm_subgraph"); } +TEST_P(Test_ONNX_layers, NormalizeFusionSubgraph) +{ + testONNXModels("normalize_fusion"); +} + TEST_P(Test_ONNX_layers, Transpose) { if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) @@ -637,6 +664,26 @@ TEST_P(Test_ONNX_layers, Slice) #endif } +TEST_P(Test_ONNX_layers, Slice_Steps_2DInput) +{ + testONNXModels("slice_opset_11_steps_2d"); +} + +TEST_P(Test_ONNX_layers, Slice_Steps_3DInput) +{ + testONNXModels("slice_opset_11_steps_3d"); +} + +TEST_P(Test_ONNX_layers, Slice_Steps_4DInput) +{ + testONNXModels("slice_opset_11_steps_4d"); +} + +TEST_P(Test_ONNX_layers, Slice_Steps_5DInput) +{ + testONNXModels("slice_opset_11_steps_5d"); +} + TEST_P(Test_ONNX_layers, Softmax) { testONNXModels("softmax"); @@ -698,6 +745,16 @@ TEST_P(Test_ONNX_layers, ResizeOpset11_Torch1_6) testONNXModels("resize_opset11_torch1.6"); } +TEST_P(Test_ONNX_layers, Mish) +{ + testONNXModels("mish"); +} + +TEST_P(Test_ONNX_layers, CalculatePads) +{ + testONNXModels("calc_pads"); +} + TEST_P(Test_ONNX_layers, Conv1d) { testONNXModels("conv1d"); @@ -710,6 +767,10 @@ TEST_P(Test_ONNX_layers, Conv1d_bias) TEST_P(Test_ONNX_layers, Conv1d_variable_weight) { + if (backend == DNN_BACKEND_CUDA) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); // not supported + if (backend == DNN_BACKEND_VKCOM) + 
applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN); // not supported String basename = "conv1d_variable_w"; Net net = readNetFromONNX(_tf("models/" + basename + ".onnx")); ASSERT_FALSE(net.empty()); @@ -730,9 +791,15 @@ TEST_P(Test_ONNX_layers, Conv1d_variable_weight) TEST_P(Test_ONNX_layers, Conv1d_variable_weight_bias) { + if (backend == DNN_BACKEND_CUDA) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); // not supported + if (backend == DNN_BACKEND_VKCOM) + applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN); // not supported if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + if (target == DNN_TARGET_CPU && getInferenceEngineCPUType() == CV_DNN_INFERENCE_ENGINE_CPU_TYPE_ARM_COMPUTE) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_ARM_CPU, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); } String basename = "conv1d_variable_wb"; Net net = readNetFromONNX(_tf("models/" + basename + ".onnx")); @@ -756,8 +823,12 @@ TEST_P(Test_ONNX_layers, Conv1d_variable_weight_bias) TEST_P(Test_ONNX_layers, GatherMultiOutput) { - if (cvtest::skipUnstableTests && backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) - throw SkipTestException("Skip unstable test: https://github.com/opencv/opencv/issues/18937"); +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif #if defined(INF_ENGINE_RELEASE) if (target == DNN_TARGET_MYRIAD) @@ -855,6 +926,7 @@ TEST_P(Test_ONNX_layers, PoolConv1d) TEST_P(Test_ONNX_layers, ConvResizePool1d) { +#if defined(INF_ENGINE_RELEASE) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) { if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); @@ -862,7 +934,12 @@ TEST_P(Test_ONNX_layers, ConvResizePool1d) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#if INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception + if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif } +#endif testONNXModels("conv_resize_pool_1d"); } diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index e6cfbe6637..2c36134724 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -81,12 +81,12 @@ class Test_TensorFlow_layers : public DNNTestLayer { public: void runTensorFlowNet(const std::string& prefix, bool hasText = false, - double l1 = 0.0, double lInf = 0.0, bool memoryLoad = false) + double l1 = 0.0, double lInf = 0.0, bool memoryLoad = false, const std::string& groupPrefix = "") { - std::string netPath = path(prefix + "_net.pb"); - std::string netConfig = (hasText ? path(prefix + "_net.pbtxt") : ""); + std::string netPath = path(prefix + groupPrefix + "_net.pb"); + std::string netConfig = (hasText ? 
path(prefix + groupPrefix + "_net.pbtxt") : ""); std::string inpPath = path(prefix + "_in.npy"); - std::string outPath = path(prefix + "_out.npy"); + std::string outPath = path(prefix + groupPrefix + "_out.npy"); cv::Mat input = blobFromNPY(inpPath); cv::Mat ref = blobFromNPY(outPath); @@ -135,6 +135,16 @@ TEST_P(Test_TensorFlow_layers, reduce_sum) runTensorFlowNet("sum_pool_by_axis"); } +TEST_P(Test_TensorFlow_layers, reduce_sum_channel) +{ + runTensorFlowNet("reduce_sum_channel"); +} + +TEST_P(Test_TensorFlow_layers, reduce_sum_channel_keep_dims) +{ + runTensorFlowNet("reduce_sum_channel", false, 0.0, 0.0, false, "_keep_dims"); +} + TEST_P(Test_TensorFlow_layers, conv_single_conv) { runTensorFlowNet("single_conv"); @@ -205,6 +215,17 @@ TEST_P(Test_TensorFlow_layers, eltwise) runTensorFlowNet("eltwise_sub"); } +TEST_P(Test_TensorFlow_layers, eltwise_add_vec) +{ + runTensorFlowNet("eltwise_add_vec"); +} + +TEST_P(Test_TensorFlow_layers, eltwise_mul_vec) +{ + runTensorFlowNet("eltwise_mul_vec"); +} + + TEST_P(Test_TensorFlow_layers, channel_broadcast) { if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) @@ -219,6 +240,12 @@ TEST_P(Test_TensorFlow_layers, pad_and_concat) TEST_P(Test_TensorFlow_layers, concat_axis_1) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif runTensorFlowNet("concat_axis_1"); } @@ -279,6 +306,10 @@ TEST_P(Test_TensorFlow_layers, batch_norm_10) } TEST_P(Test_TensorFlow_layers, batch_norm_11) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_CPU) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_CPU, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // nan +#endif if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); runTensorFlowNet("mvn_batch_norm_1x1"); @@ -478,12 +509,28 @@ TEST_P(Test_TensorFlow_layers, unfused_flatten) runTensorFlowNet("unfused_flatten_unknown_batch"); } +TEST_P(Test_TensorFlow_layers, reshape_layer) +{ + runTensorFlowNet("reshape_layer"); +} + +TEST_P(Test_TensorFlow_layers, reshape_nchw) +{ + runTensorFlowNet("reshape_nchw"); +} + +TEST_P(Test_TensorFlow_layers, reshape_conv) +{ + runTensorFlowNet("reshape_conv"); +} + TEST_P(Test_TensorFlow_layers, leaky_relu) { #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); #endif + runTensorFlowNet("leaky_relu"); runTensorFlowNet("leaky_relu_order1"); runTensorFlowNet("leaky_relu_order2"); runTensorFlowNet("leaky_relu_order3"); @@ -1001,6 +1048,19 @@ TEST_P(Test_TensorFlow_layers, resize_nearest_neighbor) runTensorFlowNet("keras_upsampling2d"); } +TEST_P(Test_TensorFlow_layers, resize_nearest_neighbor_align_corners) +{ + runTensorFlowNet("resize_nearest_neighbor", false, 0.0, 0.0, false, "_align_corners"); +} + +TEST_P(Test_TensorFlow_layers, resize_nearest_neighbor_half_pixel) +{ + if (backend == 
DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + runTensorFlowNet("resize_nearest_neighbor", false, 0.0, 0.0, false, "_half_pixel"); +} + TEST_P(Test_TensorFlow_layers, fused_resize_conv) { runTensorFlowNet("fused_resize_conv"); @@ -1056,10 +1116,61 @@ TEST_P(Test_TensorFlow_layers, keras_mobilenet_head) runTensorFlowNet("keras_learning_phase"); } +// TF case: align_corners=False, half_pixel_centers=False TEST_P(Test_TensorFlow_layers, resize_bilinear) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif runTensorFlowNet("resize_bilinear"); +} + +// TF case: align_corners=True, half_pixel_centers=False +TEST_P(Test_TensorFlow_layers, resize_bilinear_align_corners) +{ +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif + runTensorFlowNet("resize_bilinear", + false, 0.0, 0.0, false, // default parameters + "_align_corners"); +} + +// TF case: align_corners=False, half_pixel_centers=True +TEST_P(Test_TensorFlow_layers, resize_bilinear_half_pixel) +{ + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + runTensorFlowNet("resize_bilinear", false, 0.0, 0.0, false, "_half_pixel"); +} + +// TF case: align_corners=False, half_pixel_centers=False +TEST_P(Test_TensorFlow_layers, resize_bilinear_factor) +{ runTensorFlowNet("resize_bilinear_factor"); +} + +// TF case: align_corners=False, half_pixel_centers=True +TEST_P(Test_TensorFlow_layers, resize_bilinear_factor_half_pixel) +{ + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + runTensorFlowNet("resize_bilinear_factor", false, 0.0, 0.0, false, "_half_pixel"); +} + +// TF case: align_corners=True, half_pixel_centers=False +TEST_P(Test_TensorFlow_layers, resize_bilinear_factor_align_corners) +{ + runTensorFlowNet("resize_bilinear_factor", false, 0.0, 0.0, false, "_align_corners"); +} + +// TF case: align_corners=False, half_pixel_centers=False +TEST_P(Test_TensorFlow_layers, resize_bilinear_down) +{ runTensorFlowNet("resize_bilinear_down"); } diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index 54b7c1baa9..f1d636895b 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -258,6 +258,14 @@ TEST_P(Test_Torch_layers, net_conv_gemm_lrn) l1 = 0.0042; lInf = 0.021; } + // The OpenCL kernels use the native_ math functions which have + // implementation defined accuracy, so we use relaxed thresholds. See + // https://github.com/opencv/opencv/issues/9821 for more details. 
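For context, thresholds like these are consumed by an element-wise comparison of the backend output against the reference blob. A minimal sketch of such an L1/Linf check (illustrative only; the tests' real helper is normAssert from the test utilities, and ref, out, l1, lInf are assumed to be in scope):

    double normL1  = cv::norm(ref, out, cv::NORM_L1) / ref.total();  // mean absolute difference
    double normInf = cv::norm(ref, out, cv::NORM_INF);               // largest per-element difference
    EXPECT_LE(normL1, l1);
    EXPECT_LE(normInf, lInf);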
+ else if (target == DNN_TARGET_OPENCL) + { + l1 = 0.02; + lInf = 0.02; + } runTorchNet("net_conv_gemm_lrn", "", false, true, true, l1, lInf); } @@ -282,6 +290,15 @@ TEST_P(Test_Torch_layers, net_padding) TEST_P(Test_Torch_layers, net_non_spatial) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // crash + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // exception +#endif + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) applyTestTag(target == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, @@ -592,6 +609,11 @@ private: TEST_P(Test_Torch_layers, upsampling_nearest) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); // TODO +#endif + // Test a custom layer. CV_DNN_REGISTER_LAYER_CLASS(SpatialUpSamplingNearest, SpatialUpSamplingNearestLayer); try diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp index 829d8ff898..343ffd81e4 100644 --- a/modules/features2d/include/opencv2/features2d.hpp +++ b/modules/features2d/include/opencv2/features2d.hpp @@ -481,8 +481,7 @@ article](http://en.wikipedia.org/wiki/Maximally_stable_extremal_regions)). than union-find method; it actually get 1.5~2m/s on my centrino L7200 1.2GHz laptop. - the color image algorithm is taken from: @cite forssen2007maximally ; it should be much slower -than grey image method ( 3~4 times ); the chi_table.h file is taken directly from paper's source -code which is distributed under GPL. +than grey image method ( 3~4 times ) - (Python) A complete example showing the use of the %MSER detector can be found at samples/python/mser.py */ diff --git a/modules/features2d/src/blobdetector.cpp b/modules/features2d/src/blobdetector.cpp index d07e8bae83..c2215cd57c 100644 --- a/modules/features2d/src/blobdetector.cpp +++ b/modules/features2d/src/blobdetector.cpp @@ -325,13 +325,19 @@ void SimpleBlobDetectorImpl::detect(InputArray image, std::vector& std::vector < Center > curCenters; findBlobs(grayscaleImage, binarizedImage, curCenters); + if(params.maxThreshold - params.minThreshold <= params.thresholdStep) { + // if the difference between min and max threshold is less than the threshold step + // we're only going to enter the loop once, so we need to add curCenters + // to ensure we still use minDistBetweenBlobs + centers.push_back(curCenters); + } std::vector < std::vector
> newCenters; for (size_t i = 0; i < curCenters.size(); i++) { bool isNew = true; for (size_t j = 0; j < centers.size(); j++) { - double dist = norm(centers[j][ centers[j].size() / 2 ].location - curCenters[i].location); + double dist = norm(centers[j][centers[j].size() / 2 ].location - curCenters[i].location); isNew = dist >= params.minDistBetweenBlobs && dist >= centers[j][ centers[j].size() / 2 ].radius && dist >= curCenters[i].radius; if (!isNew) { diff --git a/modules/features2d/src/gftt.cpp b/modules/features2d/src/gftt.cpp index 11ed29f39d..bc97fc1677 100644 --- a/modules/features2d/src/gftt.cpp +++ b/modules/features2d/src/gftt.cpp @@ -87,6 +87,7 @@ public: } std::vector corners; + std::vector cornersQuality; if (_image.isUMat()) { @@ -97,7 +98,7 @@ public: ugrayImage = _image.getUMat(); goodFeaturesToTrack( ugrayImage, corners, nfeatures, qualityLevel, minDistance, _mask, - blockSize, gradSize, useHarrisDetector, k ); + cornersQuality, blockSize, gradSize, useHarrisDetector, k ); } else { @@ -106,14 +107,14 @@ public: cvtColor( image, grayImage, COLOR_BGR2GRAY ); goodFeaturesToTrack( grayImage, corners, nfeatures, qualityLevel, minDistance, _mask, - blockSize, gradSize, useHarrisDetector, k ); + cornersQuality, blockSize, gradSize, useHarrisDetector, k ); } + CV_Assert(corners.size() == cornersQuality.size()); + keypoints.resize(corners.size()); - std::vector::const_iterator corner_it = corners.begin(); - std::vector::iterator keypoint_it = keypoints.begin(); - for( ; corner_it != corners.end() && keypoint_it != keypoints.end(); ++corner_it, ++keypoint_it ) - *keypoint_it = KeyPoint( *corner_it, (float)blockSize ); + for (size_t i = 0; i < corners.size(); i++) + keypoints[i] = KeyPoint(corners[i], (float)blockSize, -1, cornersQuality[i]); } diff --git a/modules/features2d/src/mser.cpp b/modules/features2d/src/mser.cpp index a37b4ea482..4fe07bd6eb 100644 --- a/modules/features2d/src/mser.cpp +++ b/modules/features2d/src/mser.cpp @@ -35,7 +35,7 @@ * it actually get 1.5~2m/s on my centrino L7200 1.2GHz laptop. * 3. the color image algorithm is taken from: Maximally Stable Colour Regions for Recognition and Match; * it should be much slower than gray image method ( 3~4 times ); - * the chi_table.h file is taken directly from paper's source code which is distributed under GPL. + * the chi_table.h file is taken directly from paper's source code which is distributed under permissive BSD-like license: http://users.isy.liu.se/cvl/perfo/software/chi_table.h * 4. though the name is *contours*, the result actually is a list of point set. 
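A short usage sketch of the gftt.cpp change above: with the extra cornersQuality output of goodFeaturesToTrack, each reported KeyPoint now carries its actual corner quality in KeyPoint::response instead of the previous default of 0 (illustrative only; gray is assumed to be a valid 8-bit single-channel image):

    cv::Ptr<cv::GFTTDetector> detector = cv::GFTTDetector::create(100 /*maxCorners*/);
    std::vector<cv::KeyPoint> keypoints;
    detector->detect(gray, keypoints);
    for (const cv::KeyPoint& kp : keypoints)
        std::cout << kp.pt << " response=" << kp.response << std::endl;  // per-corner quality measure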
*/ diff --git a/modules/features2d/src/orb.cpp b/modules/features2d/src/orb.cpp index 881fc01516..85d17cdd0d 100644 --- a/modules/features2d/src/orb.cpp +++ b/modules/features2d/src/orb.cpp @@ -1025,15 +1025,20 @@ void ORB_Impl::detectAndCompute( InputArray _image, InputArray _mask, Mat imagePyramid, maskPyramid; UMat uimagePyramid, ulayerInfo; - int level_dy = image.rows + border*2; - Point level_ofs(0,0); - Size bufSize((cvRound(image.cols/getScale(0, firstLevel, scaleFactor)) + border*2 + 15) & -16, 0); + float level0_inv_scale = 1.0f / getScale(0, firstLevel, scaleFactor); + size_t level0_width = (size_t)cvRound(image.cols * level0_inv_scale); + size_t level0_height = (size_t)cvRound(image.rows * level0_inv_scale); + Size bufSize((int)alignSize(level0_width + border*2, 16), 0); // TODO change alignment to 64 + + int level_dy = (int)level0_height + border*2; + Point level_ofs(0, 0); for( level = 0; level < nLevels; level++ ) { float scale = getScale(level, firstLevel, scaleFactor); layerScale[level] = scale; - Size sz(cvRound(image.cols/scale), cvRound(image.rows/scale)); + float inv_scale = 1.0f / scale; + Size sz(cvRound(image.cols * inv_scale), cvRound(image.rows * inv_scale)); Size wholeSize(sz.width + border*2, sz.height + border*2); if( level_ofs.x + wholeSize.width > bufSize.width ) { diff --git a/modules/features2d/test/test_blobdetector.cpp b/modules/features2d/test/test_blobdetector.cpp new file mode 100644 index 0000000000..56b7145862 --- /dev/null +++ b/modules/features2d/test/test_blobdetector.cpp @@ -0,0 +1,21 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { +TEST(Features2d_BlobDetector, bug_6667) +{ + cv::Mat image = cv::Mat(cv::Size(100, 100), CV_8UC1, cv::Scalar(255, 255, 255)); + cv::circle(image, Point(50, 50), 20, cv::Scalar(0), -1); + SimpleBlobDetector::Params params; + params.minThreshold = 250; + params.maxThreshold = 260; + std::vector keypoints; + + Ptr detector = SimpleBlobDetector::create(params); + detector->detect(image, keypoints); + ASSERT_NE((int) keypoints.size(), 0); +} +}} // namespace diff --git a/modules/features2d/test/test_nearestneighbors.cpp b/modules/features2d/test/test_nearestneighbors.cpp index 42fc6fe948..11cee9cea0 100644 --- a/modules/features2d/test/test_nearestneighbors.cpp +++ b/modules/features2d/test/test_nearestneighbors.cpp @@ -123,7 +123,7 @@ void NearestNeighborTest::run( int /*start_from*/ ) { Mat desc( featuresCount, dims, CV_32FC1 ); ts->get_rng().fill( desc, RNG::UNIFORM, minValue, maxValue ); - createModel( desc ); + createModel( desc.clone() ); // .clone() is used to simulate dangling pointers problem: https://github.com/opencv/opencv/issues/17553 tempCode = checkGetPoints( desc ); if( tempCode != cvtest::TS::OK ) diff --git a/modules/features2d/test/test_orb.cpp b/modules/features2d/test/test_orb.cpp index 868bee354c..8a4d9776b3 100644 --- a/modules/features2d/test/test_orb.cpp +++ b/modules/features2d/test/test_orb.cpp @@ -90,7 +90,7 @@ TEST(Features2D_ORB, _1996) ASSERT_EQ(0, roiViolations); } -TEST(Features2D_ORB, crash) +TEST(Features2D_ORB, crash_5031) { cv::Mat image = cv::Mat::zeros(cv::Size(1920, 1080), CV_8UC3); @@ -123,4 +123,23 @@ TEST(Features2D_ORB, crash) ASSERT_NO_THROW(orb->compute(image, keypoints, descriptors)); } + +TEST(Features2D_ORB, regression_16197) +{ + Mat img(Size(72, 72), 
CV_8UC1, Scalar::all(0)); + Ptr orbPtr = ORB::create(); + orbPtr->setNLevels(5); + orbPtr->setFirstLevel(3); + orbPtr->setScaleFactor(1.8); + orbPtr->setPatchSize(8); + orbPtr->setEdgeThreshold(8); + + std::vector kps; + Mat fv; + + // exception in debug mode, crash in release + ASSERT_NO_THROW(orbPtr->detectAndCompute(img, noArray(), kps, fv)); +} + + }} // namespace diff --git a/modules/flann/include/opencv2/flann/all_indices.h b/modules/flann/include/opencv2/flann/all_indices.h index 2de18af24a..03877ab6ad 100644 --- a/modules/flann/include/opencv2/flann/all_indices.h +++ b/modules/flann/include/opencv2/flann/all_indices.h @@ -82,7 +82,7 @@ struct index_creator nnIndex = new LshIndex(dataset, params, distance); break; default: - throw FLANNException("Unknown index type"); + FLANN_THROW(cv::Error::StsBadArg, "Unknown index type"); } return nnIndex; @@ -111,7 +111,7 @@ struct index_creator nnIndex = new LshIndex(dataset, params, distance); break; default: - throw FLANNException("Unknown index type"); + FLANN_THROW(cv::Error::StsBadArg, "Unknown index type"); } return nnIndex; @@ -140,7 +140,7 @@ struct index_creator nnIndex = new LshIndex(dataset, params, distance); break; default: - throw FLANNException("Unknown index type"); + FLANN_THROW(cv::Error::StsBadArg, "Unknown index type"); } return nnIndex; diff --git a/modules/flann/include/opencv2/flann/autotuned_index.h b/modules/flann/include/opencv2/flann/autotuned_index.h index 54a60a73d6..d90f739aff 100644 --- a/modules/flann/include/opencv2/flann/autotuned_index.h +++ b/modules/flann/include/opencv2/flann/autotuned_index.h @@ -34,7 +34,6 @@ #include -#include "general.h" #include "nn_index.h" #include "ground_truth.h" #include "index_testing.h" diff --git a/modules/flann/include/opencv2/flann/composite_index.h b/modules/flann/include/opencv2/flann/composite_index.h index bcf0827c9f..f1af41ac26 100644 --- a/modules/flann/include/opencv2/flann/composite_index.h +++ b/modules/flann/include/opencv2/flann/composite_index.h @@ -33,7 +33,6 @@ //! 
@cond IGNORED -#include "general.h" #include "nn_index.h" #include "kdtree_index.h" #include "kmeans_index.h" diff --git a/modules/flann/include/opencv2/flann/flann_base.hpp b/modules/flann/include/opencv2/flann/flann_base.hpp index 0f23930024..258ec38d20 100644 --- a/modules/flann/include/opencv2/flann/flann_base.hpp +++ b/modules/flann/include/opencv2/flann/flann_base.hpp @@ -82,11 +82,11 @@ NNIndex* load_saved_index(const Matrix IndexHeader header = load_header(fin); if (header.data_type != Datatype::type()) { fclose(fin); - throw FLANNException("Datatype of saved index is different than of the one to be created."); + FLANN_THROW(cv::Error::StsError, "Datatype of saved index is different than of the one to be created."); } if ((size_t(header.rows) != dataset.rows)||(size_t(header.cols) != dataset.cols)) { fclose(fin); - throw FLANNException("The index saved belongs to a different dataset"); + FLANN_THROW(cv::Error::StsError, "The index saved belongs to a different dataset"); } IndexParams params; @@ -140,7 +140,7 @@ public: { FILE* fout = fopen(filename.c_str(), "wb"); if (fout == NULL) { - throw FLANNException("Cannot open file"); + FLANN_THROW(cv::Error::StsError, "Cannot open file"); } save_header(fout, *nnIndex_); saveIndex(fout); diff --git a/modules/flann/include/opencv2/flann/general.h b/modules/flann/include/opencv2/flann/general.h index ac848d6230..29fa8be121 100644 --- a/modules/flann/include/opencv2/flann/general.h +++ b/modules/flann/include/opencv2/flann/general.h @@ -31,6 +31,8 @@ #ifndef OPENCV_FLANN_GENERAL_H_ #define OPENCV_FLANN_GENERAL_H_ +#if CV_VERSION_MAJOR <= 4 + //! @cond IGNORED #include "opencv2/core.hpp" @@ -48,6 +50,14 @@ public: } +#define FLANN_THROW(TYPE, STR) throw FLANNException(STR) + +#else + +#define FLANN_THROW(TYPE, STR) CV_Error(TYPE, STR) + +#endif + //! 
@endcond #endif /* OPENCV_FLANN_GENERAL_H_ */ diff --git a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h index b7a650ff00..2d39d4f0f6 100644 --- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h +++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h @@ -382,7 +382,7 @@ public: chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser; } else { - throw FLANNException("Unknown algorithm for choosing initial centers."); + FLANN_THROW(cv::Error::StsError, "Unknown algorithm for choosing initial centers."); } root = new NodePtr[trees_]; @@ -446,7 +446,7 @@ public: void buildIndex() CV_OVERRIDE { if (branching_<2) { - throw FLANNException("Branching factor must be at least 2"); + FLANN_THROW(cv::Error::StsError, "Branching factor must be at least 2"); } free_indices(); diff --git a/modules/flann/include/opencv2/flann/index_testing.h b/modules/flann/include/opencv2/flann/index_testing.h index f3d147588d..207adef449 100644 --- a/modules/flann/include/opencv2/flann/index_testing.h +++ b/modules/flann/include/opencv2/flann/index_testing.h @@ -93,7 +93,7 @@ float search_with_ground_truth(NNIndex& index, const Matrix resultSet(nn+skipMatches); diff --git a/modules/flann/include/opencv2/flann/kdtree_index.h b/modules/flann/include/opencv2/flann/kdtree_index.h index 5a3d9d7fe0..603fdbd421 100644 --- a/modules/flann/include/opencv2/flann/kdtree_index.h +++ b/modules/flann/include/opencv2/flann/kdtree_index.h @@ -37,7 +37,6 @@ #include #include -#include "general.h" #include "nn_index.h" #include "dynamic_bitset.h" #include "matrix.h" diff --git a/modules/flann/include/opencv2/flann/kdtree_single_index.h b/modules/flann/include/opencv2/flann/kdtree_single_index.h index e571403b10..ed95c3db7d 100644 --- a/modules/flann/include/opencv2/flann/kdtree_single_index.h +++ b/modules/flann/include/opencv2/flann/kdtree_single_index.h @@ -37,7 +37,6 @@ #include #include -#include "general.h" #include "nn_index.h" #include "matrix.h" #include "result_set.h" diff --git a/modules/flann/include/opencv2/flann/kmeans_index.h b/modules/flann/include/opencv2/flann/kmeans_index.h index cb1a54a6d6..f73669999f 100644 --- a/modules/flann/include/opencv2/flann/kmeans_index.h +++ b/modules/flann/include/opencv2/flann/kmeans_index.h @@ -370,7 +370,7 @@ public: chooseCenters = &KMeansIndex::chooseCentersKMeanspp; } else { - throw FLANNException("Unknown algorithm for choosing initial centers."); + FLANN_THROW(cv::Error::StsBadArg, "Unknown algorithm for choosing initial centers."); } cb_index_ = 0.4f; @@ -442,7 +442,7 @@ public: void buildIndex() CV_OVERRIDE { if (branching_<2) { - throw FLANNException("Branching factor must be at least 2"); + FLANN_THROW(cv::Error::StsError, "Branching factor must be at least 2"); } free_indices(); @@ -559,7 +559,7 @@ public: { int numClusters = centers.rows; if (numClusters<1) { - throw FLANNException("Number of clusters must be at least 1"); + FLANN_THROW(cv::Error::StsBadArg, "Number of clusters must be at least 1"); } DistanceType variance; diff --git a/modules/flann/include/opencv2/flann/linear_index.h b/modules/flann/include/opencv2/flann/linear_index.h index 8a0f10fd86..6428c0d7ef 100644 --- a/modules/flann/include/opencv2/flann/linear_index.h +++ b/modules/flann/include/opencv2/flann/linear_index.h @@ -33,7 +33,6 @@ //! 
@cond IGNORED -#include "general.h" #include "nn_index.h" namespace cvflann diff --git a/modules/flann/include/opencv2/flann/lsh_index.h b/modules/flann/include/opencv2/flann/lsh_index.h index 4e3c376006..b5e87f6041 100644 --- a/modules/flann/include/opencv2/flann/lsh_index.h +++ b/modules/flann/include/opencv2/flann/lsh_index.h @@ -42,7 +42,6 @@ #include #include -#include "general.h" #include "nn_index.h" #include "matrix.h" #include "result_set.h" diff --git a/modules/flann/include/opencv2/flann/matrix.h b/modules/flann/include/opencv2/flann/matrix.h index 34893b72c3..fb871bd73c 100644 --- a/modules/flann/include/opencv2/flann/matrix.h +++ b/modules/flann/include/opencv2/flann/matrix.h @@ -35,8 +35,6 @@ #include -#include "general.h" - namespace cvflann { diff --git a/modules/flann/include/opencv2/flann/miniflann.hpp b/modules/flann/include/opencv2/flann/miniflann.hpp index 093646254c..b8df92d758 100644 --- a/modules/flann/include/opencv2/flann/miniflann.hpp +++ b/modules/flann/include/opencv2/flann/miniflann.hpp @@ -169,10 +169,13 @@ public: CV_WRAP cvflann::flann_algorithm_t getAlgorithm() const; protected: + bool load_(const String& filename); + cvflann::flann_distance_t distType; cvflann::flann_algorithm_t algo; int featureType; void* index; + Mat features_clone; // index may store features pointer internally for searching, so avoid dangling pointers: https://github.com/opencv/opencv/issues/17553 }; } } // namespace cv::flann diff --git a/modules/flann/include/opencv2/flann/nn_index.h b/modules/flann/include/opencv2/flann/nn_index.h index fbb4c7924c..f6e17d19fc 100644 --- a/modules/flann/include/opencv2/flann/nn_index.h +++ b/modules/flann/include/opencv2/flann/nn_index.h @@ -31,7 +31,6 @@ #ifndef OPENCV_FLANN_NNINDEX_H #define OPENCV_FLANN_NNINDEX_H -#include "general.h" #include "matrix.h" #include "result_set.h" #include "params.h" diff --git a/modules/flann/include/opencv2/flann/params.h b/modules/flann/include/opencv2/flann/params.h index dd3092f065..c9093cde8c 100644 --- a/modules/flann/include/opencv2/flann/params.h +++ b/modules/flann/include/opencv2/flann/params.h @@ -91,7 +91,7 @@ T get_param(const IndexParams& params, cv::String name) return it->second.cast(); } else { - throw FLANNException(cv::String("Missing parameter '")+name+cv::String("' in the parameters given")); + FLANN_THROW(cv::Error::StsBadArg, cv::String("Missing parameter '")+name+cv::String("' in the parameters given")); } } diff --git a/modules/flann/include/opencv2/flann/random.h b/modules/flann/include/opencv2/flann/random.h index 3bb48b687b..2c1809c3a9 100644 --- a/modules/flann/include/opencv2/flann/random.h +++ b/modules/flann/include/opencv2/flann/random.h @@ -37,8 +37,6 @@ #include #include -#include "general.h" - namespace cvflann { diff --git a/modules/flann/include/opencv2/flann/saving.h b/modules/flann/include/opencv2/flann/saving.h index 53359b4b7b..8b3aeb7f0a 100644 --- a/modules/flann/include/opencv2/flann/saving.h +++ b/modules/flann/include/opencv2/flann/saving.h @@ -112,11 +112,11 @@ inline IndexHeader load_header(FILE* stream) size_t read_size = fread(&header,sizeof(header),1,stream); if (read_size!=(size_t)1) { - throw FLANNException("Invalid index file, cannot read"); + FLANN_THROW(cv::Error::StsError, "Invalid index file, cannot read"); } if (strcmp(header.signature,FLANN_SIGNATURE_)!=0) { - throw FLANNException("Invalid index file, wrong signature"); + FLANN_THROW(cv::Error::StsError, "Invalid index file, wrong signature"); } return header; @@ -150,7 +150,7 @@ void load_value(FILE* 
stream, T& value, size_t count = 1) { size_t read_cnt = fread(&value, sizeof(value), count, stream); if (read_cnt != count) { - throw FLANNException("Cannot read from file"); + FLANN_THROW(cv::Error::StsParseError, "Cannot read from file"); } } @@ -159,12 +159,12 @@ void load_value(FILE* stream, cvflann::Matrix& value) { size_t read_cnt = fread(&value, sizeof(value), 1, stream); if (read_cnt != 1) { - throw FLANNException("Cannot read from file"); + FLANN_THROW(cv::Error::StsParseError, "Cannot read from file"); } value.data = new T[value.rows*value.cols]; read_cnt = fread(value.data, sizeof(T), value.rows*value.cols, stream); if (read_cnt != (size_t)(value.rows*value.cols)) { - throw FLANNException("Cannot read from file"); + FLANN_THROW(cv::Error::StsParseError, "Cannot read from file"); } } @@ -175,12 +175,12 @@ void load_value(FILE* stream, std::vector& value) size_t size; size_t read_cnt = fread(&size, sizeof(size_t), 1, stream); if (read_cnt!=1) { - throw FLANNException("Cannot read from file"); + FLANN_THROW(cv::Error::StsError, "Cannot read from file"); } value.resize(size); read_cnt = fread(&value[0], sizeof(T), size, stream); if (read_cnt != size) { - throw FLANNException("Cannot read from file"); + FLANN_THROW(cv::Error::StsError, "Cannot read from file"); } } diff --git a/modules/flann/src/miniflann.cpp b/modules/flann/src/miniflann.cpp index b56578c17f..c871875ae4 100644 --- a/modules/flann/src/miniflann.cpp +++ b/modules/flann/src/miniflann.cpp @@ -390,14 +390,18 @@ void Index::build(InputArray _data, const IndexParams& params, flann_distance_t CV_INSTRUMENT_REGION(); release(); + + // Index may reuse 'data' during search, need to keep it alive + features_clone = _data.getMat().clone(); + Mat data = features_clone; + algo = getParam(params, "algorithm", FLANN_INDEX_LINEAR); if( algo == FLANN_INDEX_SAVED ) { - load(_data, getParam(params, "filename", String())); + load_(getParam(params, "filename", String())); return; } - Mat data = _data.getMat(); index = 0; featureType = data.type(); distType = _distType; @@ -462,6 +466,8 @@ void Index::release() { CV_INSTRUMENT_REGION(); + features_clone.release(); + if( !index ) return; @@ -785,9 +791,20 @@ bool loadIndex(Index* index0, void*& index, const Mat& data, FILE* fin, const Di bool Index::load(InputArray _data, const String& filename) { - Mat data = _data.getMat(); - bool ok = true; release(); + + // Index may reuse 'data' during search, need to keep it alive + features_clone = _data.getMat().clone(); + Mat data = features_clone; + + return load_(filename); +} + +bool Index::load_(const String& filename) +{ + Mat data = features_clone; + bool ok = true; + FILE* fin = fopen(filename.c_str(), "rb"); if (fin == NULL) return false; diff --git a/modules/flann/src/precomp.hpp b/modules/flann/src/precomp.hpp index 099a6abce1..66de0c1a9c 100644 --- a/modules/flann/src/precomp.hpp +++ b/modules/flann/src/precomp.hpp @@ -13,7 +13,6 @@ #include "opencv2/flann/index_testing.h" #include "opencv2/flann/params.h" #include "opencv2/flann/saving.h" -#include "opencv2/flann/general.h" // index types #include "opencv2/flann/all_indices.h" diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt index 0067cfa389..6b586c1f99 100644 --- a/modules/gapi/CMakeLists.txt +++ b/modules/gapi/CMakeLists.txt @@ -23,7 +23,7 @@ ocv_add_module(gapi REQUIRED opencv_imgproc OPTIONAL - opencv_video + opencv_video opencv_stereo WRAP python ) @@ -38,10 +38,6 @@ if(MSVC) endif() endif() -if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") # don't add Clang 
here: issue should be investigated and fixed (workaround for Apple only) - ocv_warnings_disable(CMAKE_CXX_FLAGS -Wrange-loop-analysis) # https://github.com/opencv/opencv/issues/18928 -endif() - file(GLOB gapi_ext_hdrs "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/*.hpp" "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/*.hpp" @@ -57,6 +53,7 @@ file(GLOB gapi_ext_hdrs "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/streaming/*.hpp" "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/plaidml/*.hpp" "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/util/*.hpp" + "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/python/*.hpp" ) set(gapi_srcs @@ -80,6 +77,7 @@ set(gapi_srcs src/api/kernels_video.cpp src/api/kernels_nnparsers.cpp src/api/kernels_streaming.cpp + src/api/kernels_stereo.cpp src/api/render.cpp src/api/render_ocv.cpp src/api/ginfer.cpp @@ -107,6 +105,7 @@ set(gapi_srcs # Executor src/executor/gexecutor.cpp + src/executor/gtbbexecutor.cpp src/executor/gstreamingexecutor.cpp src/executor/gasync.cpp @@ -114,6 +113,7 @@ set(gapi_srcs src/backends/cpu/gcpubackend.cpp src/backends/cpu/gcpukernel.cpp src/backends/cpu/gcpuimgproc.cpp + src/backends/cpu/gcpustereo.cpp src/backends/cpu/gcpuvideo.cpp src/backends/cpu/gcpucore.cpp src/backends/cpu/gnnparsers.cpp @@ -156,8 +156,12 @@ set(gapi_srcs src/api/s11n.cpp src/backends/common/serialization.cpp + # Streaming backend + src/backends/streaming/gstreamingbackend.cpp + # Python bridge src/backends/ie/bindings_ie.cpp + src/backends/python/gpythonbackend.cpp ) ocv_add_dispatched_file(backends/fluid/gfluidimgproc_func SSE4_1 AVX2) @@ -196,6 +200,10 @@ if(TARGET opencv_test_gapi) target_link_libraries(opencv_test_gapi PRIVATE ade) endif() +if(HAVE_TBB AND TARGET opencv_test_gapi) + ocv_target_link_libraries(opencv_test_gapi PRIVATE tbb) +endif() + if(HAVE_FREETYPE) ocv_target_compile_definitions(${the_module} PRIVATE -DHAVE_FREETYPE) if(TARGET opencv_test_gapi) diff --git a/modules/gapi/cmake/DownloadADE.cmake b/modules/gapi/cmake/DownloadADE.cmake index ee1b645412..aa24e949ea 100644 --- a/modules/gapi/cmake/DownloadADE.cmake +++ b/modules/gapi/cmake/DownloadADE.cmake @@ -20,12 +20,26 @@ endif() set(ADE_root "${ade_src_dir}/${ade_subdir}/sources/ade") file(GLOB_RECURSE ADE_sources "${ADE_root}/source/*.cpp") file(GLOB_RECURSE ADE_include "${ADE_root}/include/ade/*.hpp") -add_library(ade STATIC ${ADE_include} ${ADE_sources}) +add_library(ade STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} + ${ADE_include} + ${ADE_sources} +) target_include_directories(ade PUBLIC $) -set_target_properties(ade PROPERTIES POSITION_INDEPENDENT_CODE True) +set_target_properties(ade PROPERTIES + POSITION_INDEPENDENT_CODE True + OUTPUT_NAME ade + DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" + COMPILE_PDB_NAME ade + COMPILE_PDB_NAME_DEBUG "ade${OPENCV_DEBUG_POSTFIX}" + ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH} +) + +if(ENABLE_SOLUTION_FOLDERS) + set_target_properties(ade PROPERTIES FOLDER "3rdparty") +endif() if(NOT BUILD_SHARED_LIBS) - ocv_install_target(ade EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) + ocv_install_target(ade EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev OPTIONAL) endif() ocv_install_3rdparty_licenses(ade "${ade_src_dir}/${ade_subdir}/LICENSE") diff --git a/modules/gapi/cmake/standalone.cmake b/modules/gapi/cmake/standalone.cmake index 5cc57d8269..d08eda1be5 100644 --- a/modules/gapi/cmake/standalone.cmake +++ b/modules/gapi/cmake/standalone.cmake @@ -21,6 +21,7 @@ file(GLOB 
FLUID_sources "${FLUID_ROOT}/src/api/g*.cpp" "${FLUID_ROOT}/src/compiler/passes/*.cpp" "${FLUID_ROOT}/src/executor/*.cpp" "${FLUID_ROOT}/src/backends/fluid/*.cpp" + "${FLUID_ROOT}/src/backends/streaming/*.cpp" "${FLUID_ROOT}/src/backends/common/*.cpp") add_library(${FLUID_TARGET} STATIC ${FLUID_includes} ${FLUID_sources}) diff --git a/modules/gapi/include/opencv2/gapi.hpp b/modules/gapi/include/opencv2/gapi.hpp index 8445746710..e4b2021479 100644 --- a/modules/gapi/include/opencv2/gapi.hpp +++ b/modules/gapi/include/opencv2/gapi.hpp @@ -33,8 +33,9 @@ #include #include -// Include this file here to avoid cyclic dependency between +// Include these files here to avoid cyclic dependency between // Desync & GKernel & GComputation & GStreamingCompiled. #include +#include #endif // OPENCV_GAPI_HPP diff --git a/modules/gapi/include/opencv2/gapi/core.hpp b/modules/gapi/include/opencv2/gapi/core.hpp index 8825585696..cb5d55d13f 100644 --- a/modules/gapi/include/opencv2/gapi/core.hpp +++ b/modules/gapi/include/opencv2/gapi/core.hpp @@ -17,6 +17,7 @@ #include #include #include +#include /** \defgroup gapi_core G-API Core functionality @{ @@ -26,6 +27,7 @@ @defgroup gapi_transform Graph API: Image and channel composition functions @} */ + namespace cv { namespace gapi { namespace core { using GMat2 = std::tuple; @@ -296,8 +298,8 @@ namespace core { } }; - G_TYPED_KERNEL(GAbsDiffC, , "org.opencv.core.matrixop.absdiffC") { - static GMatDesc outMeta(GMatDesc a, GScalarDesc) { + G_TYPED_KERNEL(GAbsDiffC, , "org.opencv.core.matrixop.absdiffC") { + static GMatDesc outMeta(const GMatDesc& a, const GScalarDesc&) { return a; } }; @@ -450,12 +452,6 @@ namespace core { } }; - G_TYPED_KERNEL(GCopy, , "org.opencv.core.transform.copy") { - static GMatDesc outMeta(GMatDesc in) { - return in; - } - }; - G_TYPED_KERNEL(GConcatHor, , "org.opencv.imgproc.transform.concatHor") { static GMatDesc outMeta(GMatDesc l, GMatDesc r) { return l.withSizeDelta(+r.size.width, 0); @@ -508,6 +504,77 @@ namespace core { return in.withType(in.depth, in.chan).withSize(dsize); } }; + + G_TYPED_KERNEL( + GKMeansND, + ,GMat,GMat>(GMat,int,GMat,TermCriteria,int,KmeansFlags)>, + "org.opencv.core.kmeansND") { + + static std::tuple + outMeta(const GMatDesc& in, int K, const GMatDesc& bestLabels, const TermCriteria&, int, + KmeansFlags flags) { + GAPI_Assert(in.depth == CV_32F); + std::vector amount_n_dim = detail::checkVector(in); + int amount = amount_n_dim[0], dim = amount_n_dim[1]; + if (amount == -1) // Mat with height != 1, width != 1, channels != 1 given + { // which means that kmeans will consider the following: + amount = in.size.height; + dim = in.size.width * in.chan; + } + // kmeans sets these labels' sizes when no bestLabels given: + GMatDesc out_labels(CV_32S, 1, Size{1, amount}); + // kmeans always sets these centers' sizes: + GMatDesc centers (CV_32F, 1, Size{dim, K}); + if (flags & KMEANS_USE_INITIAL_LABELS) + { + GAPI_Assert(bestLabels.depth == CV_32S); + int labels_amount = detail::checkVector(bestLabels, 1u); + GAPI_Assert(labels_amount == amount); + out_labels = bestLabels; // kmeans preserves bestLabels' sizes if given + } + return std::make_tuple(empty_gopaque_desc(), out_labels, centers); + } + }; + + G_TYPED_KERNEL( + GKMeansNDNoInit, + ,GMat,GMat>(GMat,int,TermCriteria,int,KmeansFlags)>, + "org.opencv.core.kmeansNDNoInit") { + + static std::tuple + outMeta(const GMatDesc& in, int K, const TermCriteria&, int, KmeansFlags flags) { + GAPI_Assert( !(flags & KMEANS_USE_INITIAL_LABELS) ); + GAPI_Assert(in.depth == CV_32F); + 
std::vector amount_n_dim = detail::checkVector(in); + int amount = amount_n_dim[0], dim = amount_n_dim[1]; + if (amount == -1) // Mat with height != 1, width != 1, channels != 1 given + { // which means that kmeans will consider the following: + amount = in.size.height; + dim = in.size.width * in.chan; + } + GMatDesc out_labels(CV_32S, 1, Size{1, amount}); + GMatDesc centers (CV_32F, 1, Size{dim, K}); + return std::make_tuple(empty_gopaque_desc(), out_labels, centers); + } + }; + + G_TYPED_KERNEL(GKMeans2D, ,GArray,GArray> + (GArray,int,GArray,TermCriteria,int,KmeansFlags)>, + "org.opencv.core.kmeans2D") { + static std::tuple + outMeta(const GArrayDesc&,int,const GArrayDesc&,const TermCriteria&,int,KmeansFlags) { + return std::make_tuple(empty_gopaque_desc(), empty_array_desc(), empty_array_desc()); + } + }; + + G_TYPED_KERNEL(GKMeans3D, ,GArray,GArray> + (GArray,int,GArray,TermCriteria,int,KmeansFlags)>, + "org.opencv.core.kmeans3D") { + static std::tuple + outMeta(const GArrayDesc&,int,const GArrayDesc&,const TermCriteria&,int,KmeansFlags) { + return std::make_tuple(empty_gopaque_desc(), empty_array_desc(), empty_array_desc()); + } + }; } // namespace core namespace streaming { @@ -524,6 +591,12 @@ G_TYPED_KERNEL(GSizeR, (GOpaque)>, "org.opencv.streaming.siz return empty_gopaque_desc(); } }; + +G_TYPED_KERNEL(GSizeMF, (GFrame)>, "org.opencv.streaming.sizeMF") { + static GOpaqueDesc outMeta(const GFrameDesc&) { + return empty_gopaque_desc(); + } +}; } // namespace streaming //! @addtogroup gapi_math @@ -572,7 +645,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param ddepth optional depth of the output matrix. @sa sub, addWeighted */ -GAPI_EXPORTS GMat addC(const GMat& src1, const GScalar& c, int ddepth = -1); +GAPI_EXPORTS_W GMat addC(const GMat& src1, const GScalar& c, int ddepth = -1); //! @overload GAPI_EXPORTS GMat addC(const GScalar& c, const GMat& src1, int ddepth = -1); @@ -1081,6 +1154,7 @@ GAPI_EXPORTS GMat bitwise_xor(const GMat& src1, const GScalar& src2); /** @brief Inverts every bit of an array. + The function bitwise_not calculates per-element bit-wise inversion of the input matrix: \f[\texttt{dst} (I) = \neg \texttt{src} (I)\f] @@ -1436,41 +1510,77 @@ Output image size will have the size dsize, the depth of output is the same as o */ GAPI_EXPORTS GMatP resizeP(const GMatP& src, const Size& dsize, int interpolation = cv::INTER_LINEAR); -/** @brief Creates one 3-channel (4-channel) matrix out of 3(4) single-channel ones. +/** @brief Creates one 4-channel matrix out of 4 single-channel ones. The function merges several matrices to make a single multi-channel matrix. That is, each element of the output matrix will be a concatenation of the elements of the input matrices, where elements of i-th input matrix are treated as mv[i].channels()-element vectors. -Input matrix must be of @ref CV_8UC3 (@ref CV_8UC4) type. +Output matrix must be of @ref CV_8UC4 type. -The function split3/split4 does the reverse operation. +The function split4 does the reverse operation. -@note Function textual ID for merge3 is "org.opencv.core.transform.merge3" -@note Function textual ID for merge4 is "org.opencv.core.transform.merge4" +@note + - Function textual ID is "org.opencv.core.transform.merge4" -@param src1 first input matrix to be merged -@param src2 second input matrix to be merged -@param src3 third input matrix to be merged -@param src4 fourth input matrix to be merged -@sa split4, split3 +@param src1 first input @ref CV_8UC1 matrix to be merged. 
+@param src2 second input @ref CV_8UC1 matrix to be merged. +@param src3 third input @ref CV_8UC1 matrix to be merged. +@param src4 fourth input @ref CV_8UC1 matrix to be merged. +@sa merge3, split4, split3 */ GAPI_EXPORTS GMat merge4(const GMat& src1, const GMat& src2, const GMat& src3, const GMat& src4); + +/** @brief Creates one 3-channel matrix out of 3 single-channel ones. + +The function merges several matrices to make a single multi-channel matrix. That is, each +element of the output matrix will be a concatenation of the elements of the input matrices, where +elements of i-th input matrix are treated as mv[i].channels()-element vectors. +Output matrix must be of @ref CV_8UC3 type. + +The function split3 does the reverse operation. + +@note + - Function textual ID is "org.opencv.core.transform.merge3" + +@param src1 first input @ref CV_8UC1 matrix to be merged. +@param src2 second input @ref CV_8UC1 matrix to be merged. +@param src3 third input @ref CV_8UC1 matrix to be merged. +@sa merge4, split4, split3 +*/ GAPI_EXPORTS GMat merge3(const GMat& src1, const GMat& src2, const GMat& src3); -/** @brief Divides a 3-channel (4-channel) matrix into 3(4) single-channel matrices. +/** @brief Divides a 4-channel matrix into 4 single-channel matrices. -The function splits a 3-channel (4-channel) matrix into 3(4) single-channel matrices: +The function splits a 4-channel matrix into 4 single-channel matrices: \f[\texttt{mv} [c](I) = \texttt{src} (I)_c\f] -All output matrices must be in @ref CV_8UC1. +All output matrices must be of @ref CV_8UC1 type. -@note Function textual for split3 ID is "org.opencv.core.transform.split3" -@note Function textual for split4 ID is "org.opencv.core.transform.split4" +The function merge4 does the reverse operation. -@param src input @ref CV_8UC4 (@ref CV_8UC3) matrix. -@sa merge3, merge4 +@note + - Function textual ID is "org.opencv.core.transform.split4" + +@param src input @ref CV_8UC4 matrix. +@sa split3, merge3, merge4 */ GAPI_EXPORTS std::tuple split4(const GMat& src); + +/** @brief Divides a 3-channel matrix into 3 single-channel matrices. + +The function splits a 3-channel matrix into 3 single-channel matrices: +\f[\texttt{mv} [c](I) = \texttt{src} (I)_c\f] + +All output matrices must be of @ref CV_8UC1 type. + +The function merge3 does the reverse operation. + +@note + - Function textual ID is "org.opencv.core.transform.split3" + +@param src input @ref CV_8UC3 matrix. +@sa split4, merge3, merge4 +*/ GAPI_EXPORTS_W std::tuple split3(const GMat& src); /** @brief Applies a generic geometrical transformation to an image. @@ -1488,7 +1598,9 @@ convert from floating to fixed-point representations of a map is that they can y cvFloor(y)) and \f$map_2\f$ contains indices in a table of interpolation coefficients. Output image must be of the same size and depth as input one. -@note Function textual ID is "org.opencv.core.transform.remap" +@note + - Function textual ID is "org.opencv.core.transform.remap" + - Due to current implementation limitations the size of an input and output images should be less than 32767x32767. @param src Source image. @param map1 The first map of either (x,y) points or just x values having the type CV_16SC2, @@ -1501,8 +1613,6 @@ and #INTER_LINEAR_EXACT are not supported by this function. borderMode=BORDER_TRANSPARENT, it means that the pixels in the destination image that corresponds to the "outliers" in the source image are not modified by the function. @param borderValue Value used in case of a constant border. By default, it is 0. 
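To make the map semantics concrete, here is a minimal sketch that expresses a horizontal flip through the documented floating-point maps (illustrative only; sz is an assumed image size within the limits noted below):

    cv::Mat map_x(sz, CV_32FC1), map_y(sz, CV_32FC1);
    for (int y = 0; y < sz.height; ++y)
        for (int x = 0; x < sz.width; ++x)
        {
            map_x.at<float>(y, x) = (float)(sz.width - 1 - x);  // mirror along x
            map_y.at<float>(y, x) = (float)y;                   // keep y unchanged
        }
    cv::GMat in;
    cv::GMat out = cv::gapi::remap(in, map_x, map_y, cv::INTER_LINEAR);
    cv::GComputation flipGraph(in, out);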
-@note
-Due to current implementation limitations the size of an input and output images should be less than 32767x32767.
 */
 GAPI_EXPORTS GMat remap(const GMat& src, const Mat& map1, const Mat& map2,
                         int interpolation, int borderMode = BORDER_CONSTANT,
@@ -1559,19 +1669,6 @@ Output matrix must be of the same depth as input one, size is specified by given
 */
 GAPI_EXPORTS GMat crop(const GMat& src, const Rect& rect);

-/** @brief Copies a matrix.
-
-Copies an input array. Works as a regular Mat::clone but happens in-graph.
-Mainly is used to workaround some existing limitations (e.g. to forward an input frame to outputs
-in the streaming mode). Will be deprecated and removed in the future.
-
-@note Function textual ID is "org.opencv.core.transform.copy"
-
-@param src input matrix.
-@sa crop
-*/
-GAPI_EXPORTS GMat copy(const GMat& src);
-
 /** @brief Applies horizontal concatenation to given matrices.

 The function horizontally concatenates two GMat matrices (with the same number of rows).
@@ -1757,6 +1854,79 @@ GAPI_EXPORTS GMat warpAffine(const GMat& src, const Mat& M, const Size& dsize, i
                              int borderMode = cv::BORDER_CONSTANT, const Scalar& borderValue = Scalar());
 //! @} gapi_transform

+/** @brief Finds centers of clusters and groups input samples around the clusters.
+
+The function kmeans implements a k-means algorithm that finds the centers of K clusters
+and groups the input samples around the clusters. As an output, \f$\texttt{bestLabels}_i\f$
+contains a 0-based cluster index for the \f$i^{th}\f$ sample.
+
+@note
+ - Function textual ID is "org.opencv.core.kmeansND"
+ - In case of an N-dimensional points' set given, the input GMat can have the following traits:
+2 dimensions, a single row or column if there are N channels,
+or N columns if there is a single channel. The Mat should have @ref CV_32F depth.
+ - However, if a GMat with height != 1, width != 1, channels != 1 is given as data, n-dimensional
+samples are considered to be given in the amount of A, where A = height, n = width * channels.
+ - In case of GMat given as data:
+   - the output labels are returned as a 1-channel GMat with sizes
+width = 1, height = A, where A is the samples amount, or width = bestLabels.width,
+height = bestLabels.height if bestLabels is given;
+   - the cluster centers are returned as a 1-channel GMat with sizes
+width = n, height = K, where n is the samples' dimensionality and K is the clusters' amount.
+ - As one of possible usages, if you want to control the initial labels for each attempt
+by yourself, you can utilize just the core of the function. To do that, set the number
+of attempts to 1, initialize labels each time using a custom algorithm, pass them with the
+( flags = #KMEANS_USE_INITIAL_LABELS ) flag, and then choose the best (most-compact) clustering.
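+
+A minimal usage sketch for the GMat overload declared below (K, termination criteria and flags
+are illustrative choices, not prescribed values):
+@code{.cpp}
+cv::GMat data; // CV_32F, one n-dimensional sample per row
+cv::GOpaque<double> compactness;
+cv::GMat labels, centers;
+std::tie(compactness, labels, centers) =
+    cv::gapi::kmeans(data, 3,
+                     cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 10, 1.0),
+                     5, cv::KMEANS_PP_CENTERS);
+@endcode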
+
+@param data Data for clustering. An array of N-dimensional points with float coordinates is needed.
+The function can take GArray<Point2f> and GArray<Point3f> for the 2D and 3D cases, or GMat for any
+dimensionality and channels.
+@param K Number of clusters to split the set by.
+@param bestLabels Optional input integer array that can store the supposed initial cluster indices
+for every sample. Used when ( flags = #KMEANS_USE_INITIAL_LABELS ) flag is set.
+@param criteria The algorithm termination criteria, that is, the maximum number of iterations
+and/or the desired accuracy. The accuracy is specified as criteria.epsilon. As soon as each of
+the cluster centers moves by less than criteria.epsilon on some iteration, the algorithm stops.
+@param attempts Flag to specify the number of times the algorithm is executed using different
+initial labellings. The algorithm returns the labels that yield the best compactness (see the
+first function return value).
+@param flags Flag that can take values of cv::KmeansFlags.
+
+@return
+ - Compactness measure that is computed as
+\f[\sum _i  \| \texttt{samples} _i - \texttt{centers} _{ \texttt{labels} _i} \| ^2\f]
+after every attempt. The best (minimum) value is chosen and the corresponding labels and the
+compactness value are returned by the function.
+ - Integer array that stores the cluster indices for every sample.
+ - Array of the cluster centers.
+*/
+GAPI_EXPORTS std::tuple<GOpaque<double>,GMat,GMat>
+kmeans(const GMat& data, const int K, const GMat& bestLabels,
+       const TermCriteria& criteria, const int attempts, const KmeansFlags flags);
+
+/** @overload
+@note
+ - Function textual ID is "org.opencv.core.kmeansNDNoInit"
+ - #KMEANS_USE_INITIAL_LABELS flag must not be set while using this overload.
+ */
+GAPI_EXPORTS_W std::tuple<GOpaque<double>,GMat,GMat>
+kmeans(const GMat& data, const int K, const TermCriteria& criteria, const int attempts,
+       const KmeansFlags flags);
+
+/** @overload
+@note Function textual ID is "org.opencv.core.kmeans2D"
+ */
+GAPI_EXPORTS_W std::tuple<GOpaque<double>,GArray<int>,GArray<Point2f>>
+kmeans(const GArray<Point2f>& data, const int K, const GArray<int>& bestLabels,
+       const TermCriteria& criteria, const int attempts, const KmeansFlags flags);
+
+/** @overload
+@note Function textual ID is "org.opencv.core.kmeans3D"
+ */
+GAPI_EXPORTS std::tuple<GOpaque<double>,GArray<int>,GArray<Point3f>>
+kmeans(const GArray<Point3f>& data, const int K, const GArray<int>& bestLabels,
+       const TermCriteria& criteria, const int attempts, const KmeansFlags flags);
+
 namespace streaming {
 /** @brief Gets dimensions from Mat.

@@ -1765,7 +1935,7 @@ namespace streaming {
 @param src Input tensor
 @return Size (tensor dimensions).
 */
-GAPI_EXPORTS GOpaque<Size> size(const GMat& src);
+GAPI_EXPORTS_W GOpaque<Size> size(const GMat& src);

 /** @overload
 Gets dimensions from rectangle.
@@ -1775,7 +1945,16 @@ Gets dimensions from rectangle.
 @param r Input rectangle.
 @return Size (rectangle dimensions).
 */
-GAPI_EXPORTS GOpaque<Size> size(const GOpaque<Rect>& r);
+GAPI_EXPORTS_W GOpaque<Size> size(const GOpaque<Rect>& r);
+
+/** @brief Gets dimensions from MediaFrame.
+
+@note Function textual ID is "org.opencv.streaming.sizeMF"
+
+@param src Input frame
+@return Size (frame dimensions).
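+
+A minimal usage sketch (assumes a GFrame input, e.g. one coming from a video source):
+@code{.cpp}
+cv::GFrame in;
+cv::GOpaque<cv::Size> sz = cv::gapi::streaming::size(in);
+@endcode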
+*/ +GAPI_EXPORTS GOpaque size(const GFrame& src); } //namespace streaming } //namespace gapi } //namespace cv diff --git a/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp b/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp index 5dd70bd2e8..5539e244ba 100644 --- a/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp +++ b/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp @@ -101,6 +101,7 @@ public: const cv::Scalar& inVal(int input); cv::Scalar& outValR(int output); // FIXME: Avoid cv::Scalar s = ctx.outValR() + cv::MediaFrame& outFrame(int output); template std::vector& outVecR(int output) // FIXME: the same issue { return outVecRef(output).wref(); @@ -189,6 +190,11 @@ template<> struct get_in >: public get_in>/GArray> conversion should be done more gracefully in the system +template struct get_in> >: public get_in> > +{ +}; + //FIXME(dm): GOpaque/GOpaque conversion should be done more gracefully in the system template<> struct get_in >: public get_in > { @@ -258,6 +264,13 @@ template<> struct get_out return ctx.outValR(idx); } }; +template<> struct get_out +{ + static cv::MediaFrame& get(GCPUContext &ctx, int idx) + { + return ctx.outFrame(idx); + } +}; template struct get_out> { static std::vector& get(GCPUContext &ctx, int idx) diff --git a/modules/gapi/include/opencv2/gapi/cpu/stereo.hpp b/modules/gapi/include/opencv2/gapi/cpu/stereo.hpp new file mode 100644 index 0000000000..f7d79e9b3c --- /dev/null +++ b/modules/gapi/include/opencv2/gapi/cpu/stereo.hpp @@ -0,0 +1,48 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2021 Intel Corporation + +#ifndef OPENCV_GAPI_CPU_STEREO_API_HPP +#define OPENCV_GAPI_CPU_STEREO_API_HPP + +#include // GKernelPackage + +namespace cv { +namespace gapi { +namespace calib3d { +namespace cpu { + +GAPI_EXPORTS GKernelPackage kernels(); + +/** @brief Structure for the Stereo operation initialization parameters.*/ +struct GAPI_EXPORTS StereoInitParam { + StereoInitParam(int nD, int bS, double bL, double f): + numDisparities(nD), blockSize(bS), baseline(bL), focus(f) {} + + StereoInitParam() = default; + + int numDisparities = 0; + int blockSize = 21; + double baseline = 70.; + double focus = 1000.; +}; + +} // namespace cpu +} // namespace calib3d +} // namespace gapi + +namespace detail { + + template<> struct CompileArgTag { + static const char* tag() { + return "org.opencv.stereoInit"; + } +}; + +} // namespace detail +} // namespace cv + + +#endif // OPENCV_GAPI_CPU_STEREO_API_HPP diff --git a/modules/gapi/include/opencv2/gapi/garg.hpp b/modules/gapi/include/opencv2/gapi/garg.hpp index 0838573b56..20f2233bf9 100644 --- a/modules/gapi/include/opencv2/gapi/garg.hpp +++ b/modules/gapi/include/opencv2/gapi/garg.hpp @@ -210,6 +210,7 @@ using GRunArgP = util::variant< cv::Mat*, cv::RMat*, cv::Scalar*, + cv::MediaFrame*, cv::detail::VectorRef, cv::detail::OpaqueRef >; @@ -248,6 +249,30 @@ template inline GRunArgsP gout(Ts&... args) return GRunArgsP{ GRunArgP(detail::wrap_host_helper::wrap_out(args))... 
}; } +struct GTypeInfo; +using GTypesInfo = std::vector; + +// FIXME: Needed for python bridge, must be moved to more appropriate header +namespace detail { +struct ExtractArgsCallback +{ + cv::GRunArgs operator()(const cv::GTypesInfo& info) const { return c(info); } + using CallBackT = std::function; + CallBackT c; +}; + +struct ExtractMetaCallback +{ + cv::GMetaArgs operator()(const cv::GTypesInfo& info) const { return c(info); } + using CallBackT = std::function; + CallBackT c; +}; + +void constructGraphOutputs(const cv::GTypesInfo &out_info, + cv::GRunArgs &args, + cv::GRunArgsP &outs); +} // namespace detail + } // namespace cv #endif // OPENCV_GAPI_GARG_HPP diff --git a/modules/gapi/include/opencv2/gapi/garray.hpp b/modules/gapi/include/opencv2/gapi/garray.hpp index 5d4b3c59e0..32799bc07e 100644 --- a/modules/gapi/include/opencv2/gapi/garray.hpp +++ b/modules/gapi/include/opencv2/gapi/garray.hpp @@ -35,14 +35,14 @@ template class GArray; * \addtogroup gapi_meta_args * @{ */ -struct GArrayDesc +struct GAPI_EXPORTS_W_SIMPLE GArrayDesc { // FIXME: Body // FIXME: Also implement proper operator== then bool operator== (const GArrayDesc&) const { return true; } }; template GArrayDesc descr_of(const std::vector &) { return {};} -static inline GArrayDesc empty_array_desc() {return {}; } +GAPI_EXPORTS_W inline GArrayDesc empty_array_desc() {return {}; } /** @} */ std::ostream& operator<<(std::ostream& os, const cv::GArrayDesc &desc); @@ -246,12 +246,18 @@ namespace detail public: VectorRef() = default; - template explicit VectorRef(const std::vector& vec) : - m_ref(new VectorRefT(vec)), m_kind(GOpaqueTraits::kind) {} - template explicit VectorRef(std::vector& vec) : - m_ref(new VectorRefT(vec)), m_kind(GOpaqueTraits::kind) {} - template explicit VectorRef(std::vector&& vec) : - m_ref(new VectorRefT(std::move(vec))), m_kind(GOpaqueTraits::kind) {} + template explicit VectorRef(const std::vector& vec) + : m_ref(new VectorRefT(vec)) + , m_kind(GOpaqueTraits::kind) + {} + template explicit VectorRef(std::vector& vec) + : m_ref(new VectorRefT(vec)) + , m_kind(GOpaqueTraits::kind) + {} + template explicit VectorRef(std::vector&& vec) + : m_ref(new VectorRefT(std::move(vec))) + , m_kind(GOpaqueTraits::kind) + {} cv::detail::OpaqueKind getKind() const { @@ -321,9 +327,10 @@ namespace detail # define FLATTEN_NS cv #endif template struct flatten_g; - template<> struct flatten_g { using type = FLATTEN_NS::Mat; }; - template<> struct flatten_g { using type = FLATTEN_NS::Scalar; }; - template struct flatten_g { using type = T; }; + template<> struct flatten_g { using type = FLATTEN_NS::Mat; }; + template<> struct flatten_g { using type = FLATTEN_NS::Scalar; }; + template struct flatten_g> { using type = std::vector; }; + template struct flatten_g { using type = T; }; #undef FLATTEN_NS // FIXME: the above mainly duplicates "ProtoToParam" thing from gtyped.hpp // but I decided not to include gtyped here - probably worth moving that stuff @@ -368,8 +375,6 @@ private: detail::GArrayU m_ref; }; -using GArrayP2f = GArray; - /** @} */ } // namespace cv diff --git a/modules/gapi/include/opencv2/gapi/gcall.hpp b/modules/gapi/include/opencv2/gapi/gcall.hpp index 511eca1408..8d1b8d6010 100644 --- a/modules/gapi/include/opencv2/gapi/gcall.hpp +++ b/modules/gapi/include/opencv2/gapi/gcall.hpp @@ -11,6 +11,7 @@ #include // GArg #include // GMat #include // GScalar +#include // GFrame #include // GArray #include // GOpaque @@ -41,6 +42,7 @@ public: GMat yield (int output = 0); GMatP yieldP (int output = 0); GScalar 
yieldScalar(int output = 0); + GFrame yieldFrame (int output = 0); template GArray yieldArray(int output = 0) { diff --git a/modules/gapi/include/opencv2/gapi/gcommon.hpp b/modules/gapi/include/opencv2/gapi/gcommon.hpp index a474140baa..8119e397eb 100644 --- a/modules/gapi/include/opencv2/gapi/gcommon.hpp +++ b/modules/gapi/include/opencv2/gapi/gcommon.hpp @@ -204,12 +204,12 @@ template GCompileArgs compile_args(Ts&&... args) return GCompileArgs{ GCompileArg(args)... }; } +namespace gapi +{ /** * @brief Retrieves particular compilation argument by its type from * cv::GCompileArgs */ -namespace gapi -{ template inline cv::util::optional getCompileArg(const cv::GCompileArgs &args) { diff --git a/modules/gapi/include/opencv2/gapi/gcomputation.hpp b/modules/gapi/include/opencv2/gapi/gcomputation.hpp index 8732ada0d6..a3566fb495 100644 --- a/modules/gapi/include/opencv2/gapi/gcomputation.hpp +++ b/modules/gapi/include/opencv2/gapi/gcomputation.hpp @@ -258,7 +258,8 @@ public: void apply(GRunArgs &&ins, GRunArgsP &&outs, GCompileArgs &&args = {}); // Arg-to-arg overload /// @private -- Exclude this function from OpenCV documentation - GAPI_WRAP GRunArgs apply(GRunArgs &&ins, GCompileArgs &&args = {}); + GAPI_WRAP GRunArgs apply(const cv::detail::ExtractArgsCallback &callback, + GCompileArgs &&args = {}); /// @private -- Exclude this function from OpenCV documentation void apply(const std::vector& ins, // Compatibility overload @@ -436,7 +437,11 @@ public: * * @sa @ref gapi_compile_args */ - GAPI_WRAP GStreamingCompiled compileStreaming(GMetaArgs &&in_metas, GCompileArgs &&args = {}); + GStreamingCompiled compileStreaming(GMetaArgs &&in_metas, GCompileArgs &&args = {}); + + /// @private -- Exclude this function from OpenCV documentation + GAPI_WRAP GStreamingCompiled compileStreaming(const cv::detail::ExtractMetaCallback &callback, + GCompileArgs &&args = {}); /** * @brief Compile the computation for streaming mode. diff --git a/modules/gapi/include/opencv2/gapi/gframe.hpp b/modules/gapi/include/opencv2/gapi/gframe.hpp index f555a93aa3..13fd5d6d29 100644 --- a/modules/gapi/include/opencv2/gapi/gframe.hpp +++ b/modules/gapi/include/opencv2/gapi/gframe.hpp @@ -62,6 +62,9 @@ struct GAPI_EXPORTS GFrameDesc static inline GFrameDesc empty_gframe_desc() { return GFrameDesc{}; } /** @} */ +class MediaFrame; +GAPI_EXPORTS GFrameDesc descr_of(const MediaFrame &frame); + GAPI_EXPORTS std::ostream& operator<<(std::ostream& os, const cv::GFrameDesc &desc); } // namespace cv diff --git a/modules/gapi/include/opencv2/gapi/gkernel.hpp b/modules/gapi/include/opencv2/gapi/gkernel.hpp index 0ec7dd07c0..f70e50253d 100644 --- a/modules/gapi/include/opencv2/gapi/gkernel.hpp +++ b/modules/gapi/include/opencv2/gapi/gkernel.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018-2021 Intel Corporation #ifndef OPENCV_GAPI_GKERNEL_HPP @@ -30,6 +30,7 @@ struct GTypeInfo { GShape shape; cv::detail::OpaqueKind kind; + detail::HostCtor ctor; }; using GShapes = std::vector; @@ -90,6 +91,10 @@ namespace detail { static inline cv::GOpaque yield(cv::GCall &call, int i) { return call.yieldOpaque(i); } }; + template<> struct Yield + { + static inline cv::GFrame yield(cv::GCall &call, int i) { return call.yieldFrame(i); } + }; //////////////////////////////////////////////////////////////////////////// // Helper classes which brings outputMeta() marshalling to kernel @@ -239,8 +244,6 @@ public: using InArgs = std::tuple; using OutArgs = std::tuple; - static_assert(!cv::detail::contains::value, "Values of GFrame type can't be used as operation outputs"); - static R on(Args... args) { cv::GCall call(GKernel{ K::id() @@ -514,6 +517,13 @@ namespace gapi { */ const std::vector& get_transformations() const; + /** + * @brief Returns vector of kernel ids included in the package + * + * @return vector of kernel ids included in the package + */ + std::vector get_kernel_ids() const; + /** * @brief Test if a particular kernel _implementation_ KImpl is * included in this kernel package. @@ -603,6 +613,18 @@ namespace gapi { includeHelper(); } + /** + * @brief Adds a new kernel based on it's backend and id into the kernel package + * + * @param backend backend associated with the kernel + * @param kernel_id a name/id of the kernel + */ + void include(const cv::gapi::GBackend& backend, const std::string& kernel_id) + { + removeAPI(kernel_id); + m_id_kernels[kernel_id] = std::make_pair(backend, GKernelImpl{{}, {}}); + } + /** * @brief Lists all backends which are included into package * diff --git a/modules/gapi/include/opencv2/gapi/gmat.hpp b/modules/gapi/include/opencv2/gapi/gmat.hpp index f441413be5..5e567fb107 100644 --- a/modules/gapi/include/opencv2/gapi/gmat.hpp +++ b/modules/gapi/include/opencv2/gapi/gmat.hpp @@ -73,25 +73,25 @@ class RMat; * \addtogroup gapi_meta_args * @{ */ -struct GAPI_EXPORTS GMatDesc +struct GAPI_EXPORTS_W_SIMPLE GMatDesc { // FIXME: Default initializers in C++14 - int depth; - int chan; - cv::Size size; // NB.: no multi-dimensional cases covered yet - bool planar; - std::vector dims; // FIXME: Maybe it's real questionable to have it here + GAPI_PROP int depth; + GAPI_PROP int chan; + GAPI_PROP cv::Size size; // NB.: no multi-dimensional cases covered yet + GAPI_PROP bool planar; + GAPI_PROP std::vector dims; // FIXME: Maybe it's real questionable to have it here - GMatDesc(int d, int c, cv::Size s, bool p = false) + GAPI_WRAP GMatDesc(int d, int c, cv::Size s, bool p = false) : depth(d), chan(c), size(s), planar(p) {} - GMatDesc(int d, const std::vector &dd) + GAPI_WRAP GMatDesc(int d, const std::vector &dd) : depth(d), chan(-1), size{-1,-1}, planar(false), dims(dd) {} - GMatDesc(int d, std::vector &&dd) + GAPI_WRAP GMatDesc(int d, std::vector &&dd) : depth(d), chan(-1), size{-1,-1}, planar(false), dims(std::move(dd)) {} - GMatDesc() : GMatDesc(-1, -1, {-1,-1}) {} + GAPI_WRAP GMatDesc() : GMatDesc(-1, -1, {-1,-1}) {} inline bool operator== (const GMatDesc &rhs) const { @@ -155,7 +155,7 @@ struct GAPI_EXPORTS GMatDesc // Meta combinator: return a new GMatDesc with specified data depth // and number of channels. 
// (all other fields are taken unchanged from this GMatDesc) - GMatDesc withType(int ddepth, int dchan) const + GAPI_WRAP GMatDesc withType(int ddepth, int dchan) const { GAPI_Assert(CV_MAT_CN(ddepth) == 1 || ddepth == -1); GMatDesc desc = withDepth(ddepth); @@ -203,6 +203,27 @@ struct GAPI_EXPORTS GMatDesc static inline GMatDesc empty_gmat_desc() { return GMatDesc{-1,-1,{-1,-1}}; } +namespace gapi { namespace detail { +/** Checks GMatDesc fields if the passed matrix is a set of n-dimentional points. +@param in GMatDesc to check. +@param n expected dimensionality. +@return the amount of points. In case input matrix can't be described as vector of points +of expected dimensionality, returns -1. + */ +int checkVector(const GMatDesc& in, const size_t n); + +/** @overload + +Checks GMatDesc fields if the passed matrix can be described as a set of points of any +dimensionality. + +@return array of two elements in form of std::vector: the amount of points +and their calculated dimensionality. In case input matrix can't be described as vector of points, +returns {-1, -1}. + */ +std::vector checkVector(const GMatDesc& in); +}} // namespace gapi::detail + #if !defined(GAPI_STANDALONE) GAPI_EXPORTS GMatDesc descr_of(const cv::UMat &mat); #endif // !defined(GAPI_STANDALONE) diff --git a/modules/gapi/include/opencv2/gapi/gopaque.hpp b/modules/gapi/include/opencv2/gapi/gopaque.hpp index 6117971768..00f0718422 100644 --- a/modules/gapi/include/opencv2/gapi/gopaque.hpp +++ b/modules/gapi/include/opencv2/gapi/gopaque.hpp @@ -21,6 +21,9 @@ #include #include +#include // OpaqueKind +#include // TypeHintBase + namespace cv { // Forward declaration; GNode and GOrigin are an internal @@ -33,14 +36,14 @@ template class GOpaque; * \addtogroup gapi_meta_args * @{ */ -struct GOpaqueDesc +struct GAPI_EXPORTS_W_SIMPLE GOpaqueDesc { // FIXME: Body // FIXME: Also implement proper operator== then bool operator== (const GOpaqueDesc&) const { return true; } }; template GOpaqueDesc descr_of(const U &) { return {};} -static inline GOpaqueDesc empty_gopaque_desc() {return {}; } +GAPI_EXPORTS_W inline GOpaqueDesc empty_gopaque_desc() {return {}; } /** @} */ std::ostream& operator<<(std::ostream& os, const cv::GOpaqueDesc &desc); diff --git a/modules/gapi/include/opencv2/gapi/gproto.hpp b/modules/gapi/include/opencv2/gapi/gproto.hpp index f91fcdb2c8..fbcccb38ea 100644 --- a/modules/gapi/include/opencv2/gapi/gproto.hpp +++ b/modules/gapi/include/opencv2/gapi/gproto.hpp @@ -135,7 +135,7 @@ GRunArg value_of(const GOrigin &origin); // Transform run-time computation arguments into a collection of metadata // extracted from that arguments GMetaArg GAPI_EXPORTS descr_of(const GRunArg &arg ); -GMetaArgs GAPI_EXPORTS_W descr_of(const GRunArgs &args); +GMetaArgs GAPI_EXPORTS descr_of(const GRunArgs &args); // Transform run-time operation result argument into metadata extracted from that argument // Used to compare the metadata, which generated at compile time with the metadata result operation in run time diff --git a/modules/gapi/include/opencv2/gapi/gscalar.hpp b/modules/gapi/include/opencv2/gapi/gscalar.hpp index 00abdd1d13..d4af2cab5d 100644 --- a/modules/gapi/include/opencv2/gapi/gscalar.hpp +++ b/modules/gapi/include/opencv2/gapi/gscalar.hpp @@ -49,7 +49,7 @@ private: * \addtogroup gapi_meta_args * @{ */ -struct GScalarDesc +struct GAPI_EXPORTS_W_SIMPLE GScalarDesc { // NB.: right now it is empty @@ -64,9 +64,9 @@ struct GScalarDesc } }; -static inline GScalarDesc empty_scalar_desc() { return GScalarDesc(); } +GAPI_EXPORTS_W 
inline GScalarDesc empty_scalar_desc() { return GScalarDesc(); }

-GAPI_EXPORTS GScalarDesc descr_of(const cv::Scalar &scalar);
+GAPI_EXPORTS GScalarDesc descr_of(const cv::Scalar &scalar);

 std::ostream& operator<<(std::ostream& os, const cv::GScalarDesc &desc);
diff --git a/modules/gapi/include/opencv2/gapi/gstreaming.hpp b/modules/gapi/include/opencv2/gapi/gstreaming.hpp
index e09cf8d0f7..4e579caafb 100644
--- a/modules/gapi/include/opencv2/gapi/gstreaming.hpp
+++ b/modules/gapi/include/opencv2/gapi/gstreaming.hpp
@@ -180,7 +180,10 @@ public:
      * @param ins vector of inputs to process.
      * @sa gin
      */
-    GAPI_WRAP void setSource(GRunArgs &&ins);
+    void setSource(GRunArgs &&ins);
+
+    /// @private -- Exclude this function from OpenCV documentation
+    GAPI_WRAP void setSource(const cv::detail::ExtractArgsCallback& callback);

     /**
      * @brief Specify an input video stream for a single-input
@@ -251,6 +254,7 @@ public:
     bool pull(cv::GRunArgsP &&outs);

     // NB: Used from python
+    /// @private -- Exclude this function from OpenCV documentation
     GAPI_WRAP std::tuple<bool, cv::GRunArgs> pull();

     /**
diff --git a/modules/gapi/include/opencv2/gapi/imgproc.hpp b/modules/gapi/include/opencv2/gapi/imgproc.hpp
index 7435ec1e1d..25a64a5067 100644
--- a/modules/gapi/include/opencv2/gapi/imgproc.hpp
+++ b/modules/gapi/include/opencv2/gapi/imgproc.hpp
@@ -43,15 +43,6 @@ void validateFindingContoursMeta(const int depth, const int chan, const int mode
         break;
     }
 }
-
-// Checks if the passed mat is a set of n-dimentional points of the given depth
-bool isPointsVector(const int chan, const cv::Size &size, const int depth,
-                    const int n, const int ddepth = -1)
-{
-    return (ddepth == depth || ddepth < 0) &&
-           ((chan == n && (size.height == 1 || size.width == 1)) ||
-            (chan == 1 && size.width == n));
-}
 } // anonymous namespace

 namespace cv { namespace gapi {
@@ -212,10 +203,17 @@ namespace imgproc {
     G_TYPED_KERNEL(GBoundingRectMat, <GOpaque<Rect>(GMat)>,
                    "org.opencv.imgproc.shape.boundingRectMat") {
         static GOpaqueDesc outMeta(GMatDesc in) {
-            GAPI_Assert((in.depth == CV_8U && in.chan == 1) ||
-                        (isPointsVector(in.chan, in.size, in.depth, 2, CV_32S) ||
-                         isPointsVector(in.chan, in.size, in.depth, 2, CV_32F)));
-
+            if (in.depth == CV_8U)
+            {
+                GAPI_Assert(in.chan == 1);
+            }
+            else
+            {
+                GAPI_Assert (in.depth == CV_32S || in.depth == CV_32F);
+                int amount = detail::checkVector(in, 2u);
+                GAPI_Assert(amount != -1 &&
+                            "Input Mat can't be described as vector of 2-dimensional points");
+            }
             return empty_gopaque_desc();
         }
     };
@@ -237,7 +235,9 @@ namespace imgproc {
     G_TYPED_KERNEL(GFitLine2DMat, <GOpaque<Vec4f>(GMat,DistanceTypes,double,double,double)>,
                    "org.opencv.imgproc.shape.fitLine2DMat") {
         static GOpaqueDesc outMeta(GMatDesc in,DistanceTypes,double,double,double) {
-            GAPI_Assert(isPointsVector(in.chan, in.size, in.depth, 2, -1));
+            int amount = detail::checkVector(in, 2u);
+            GAPI_Assert(amount != -1 &&
+                        "Input Mat can't be described as vector of 2-dimensional points");
             return empty_gopaque_desc();
         }
     };
@@ -269,7 +269,9 @@ namespace imgproc {
     G_TYPED_KERNEL(GFitLine3DMat, <GOpaque<Vec6f>(GMat,DistanceTypes,double,double,double)>,
                    "org.opencv.imgproc.shape.fitLine3DMat") {
         static GOpaqueDesc outMeta(GMatDesc in,int,double,double,double) {
-            GAPI_Assert(isPointsVector(in.chan, in.size, in.depth, 3, -1));
+            int amount = detail::checkVector(in, 3u);
+            GAPI_Assert(amount != -1 &&
+                        "Input Mat can't be described as vector of 3-dimensional points");
             return empty_gopaque_desc();
         }
     };
@@ -501,10 +503,10 @@ kernel kernelY. The final result is returned.
Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
 Output image must have the same type, size, and number of channels as the input image.
-@note In case of floating-point computation, rounding to nearest even is procedeed
+@note
+ - In case of floating-point computation, rounding to nearest even is performed
 if hardware supports it (if not - to nearest value).
-
-@note Function textual ID is "org.opencv.imgproc.filters.sepfilter"
+ - Function textual ID is "org.opencv.imgproc.filters.sepfilter"

 @param src Source image.
 @param ddepth desired depth of the destination image (the following combinations of src.depth() and ddepth are supported:
@@ -543,9 +545,9 @@ anchor.y - 1)`.

 Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
 Output image must have the same size and number of channels an input image.
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
-
-@note Function textual ID is "org.opencv.imgproc.filters.filter2D"
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.filter2D"

 @param src input image.
 @param ddepth desired depth of the destination image
@@ -580,9 +582,9 @@ algorithms, and so on). If you need to compute pixel sums over variable-size win

 Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
 Output image must have the same type, size, and number of channels as the input image.
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
-
-@note Function textual ID is "org.opencv.imgproc.filters.boxfilter"
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.boxfilter"

 @param src Source image.
 @param dtype the output image depth (-1 to set the input image data type).
@@ -609,9 +611,9 @@ true, borderType)`.

 Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
 Output image must have the same type, size, and number of channels as the input image.
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
-
-@note Function textual ID is "org.opencv.imgproc.filters.blur"
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.blur"

 @param src Source image.
 @param ksize blurring kernel size.
@@ -637,9 +639,9 @@ Output image must have the same type and number of channels an input image.

 Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
 Output image must have the same type, size, and number of channels as the input image.
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
-
-@note Function textual ID is "org.opencv.imgproc.filters.gaussianBlur"
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.gaussianBlur"

 @param src input image;
 @param ksize Gaussian kernel size.
ksize.width and ksize.height can differ but they both must be
@@ -662,10 +664,10 @@ GAPI_EXPORTS GMat gaussianBlur(const GMat& src, const Size& ksize, double sigmaX

 The function smoothes an image using the median filter with the \f$\texttt{ksize} \times
 \texttt{ksize}\f$ aperture. Each channel of a multi-channel image is processed independently.
 Output image must have the same type, size, and number of channels as the input image.
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
 The median filter uses cv::BORDER_REPLICATE internally to cope with border pixels,
 see cv::BorderTypes
-
-@note Function textual ID is "org.opencv.imgproc.filters.medianBlur"
+ - Function textual ID is "org.opencv.imgproc.filters.medianBlur"

 @param src input matrix (image)
 @param ksize aperture linear size; it must be odd and greater than 1, for example: 3, 5, 7 ...
@@ -683,9 +685,9 @@ shape of a pixel neighborhood over which the minimum is taken:

 Erosion can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
 Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
 Output image must have the same type, size, and number of channels as the input image.
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
-
-@note Function textual ID is "org.opencv.imgproc.filters.erode"
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.erode"

 @param src input image
 @param kernel structuring element used for erosion; if `element=Mat()`, a `3 x 3` rectangular
@@ -707,7 +709,9 @@ The function erodes the source image using the rectangular structuring element w
 Erosion can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
 Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
 Output image must have the same type, size, and number of channels as the input image.
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.erode"

 @param src input image
 @param iterations number of times erosion is applied.
@@ -728,9 +732,9 @@ shape of a pixel neighborhood over which the maximum is taken:

 Dilation can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
 Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
 Output image must have the same type, size, and number of channels as the input image.
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
-
-@note Function textual ID is "org.opencv.imgproc.filters.dilate"
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.dilate"

 @param src input image.
@param kernel structuring element used for dilation; if elemenat=Mat(), a 3 x 3 rectangular
@@ -755,9 +759,9 @@ shape of a pixel neighborhood over which the maximum is taken:

 Dilation can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
 Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
 Output image must have the same type, size, and number of channels as the input image.
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
-
-@note Function textual ID is "org.opencv.imgproc.filters.dilate"
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.dilate"

 @param src input image.
 @param iterations number of times dilation is applied.
@@ -778,7 +782,12 @@ basic operations. Any of the operations can be done in-place. In case of
 multi-channel images, each channel is processed independently.

-@note Function textual ID is "org.opencv.imgproc.filters.morphologyEx"
+@note
+ - Function textual ID is "org.opencv.imgproc.filters.morphologyEx"
+ - The number of iterations is the number of times the erosion or dilation operation will be
+applied. For instance, an opening operation (#MORPH_OPEN) with two iterations is equivalent to
+applying successively: erode -> erode -> dilate -> dilate
+(and not erode -> dilate -> erode -> dilate).

 @param src Input image.
 @param op Type of a morphological operation, see #MorphTypes
@@ -790,10 +799,6 @@ the kernel center.
 @param borderValue Border value in case of a constant border. The default value has a special
 meaning.
 @sa dilate, erode, getStructuringElement
-@note The number of iterations is the number of times erosion or dilatation operation will be
-applied. For instance, an opening operation (#MORPH_OPEN) with two iterations is equivalent to
-apply successively: erode -> erode -> dilate -> dilate
-(and not erode -> dilate -> erode -> dilate).
 */
 GAPI_EXPORTS GMat morphologyEx(const GMat &src, const MorphTypes op, const Mat &kernel,
                                const Point &anchor = Point(-1,-1),
@@ -830,9 +835,9 @@ The second case corresponds to a kernel of:

 \f[\vecthreethree{-1}{-2}{-1}{0}{0}{0}{1}{2}{1}\f]

-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
-
-@note Function textual ID is "org.opencv.imgproc.filters.sobel"
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.sobel"

 @param src input image.
@param ddepth output image depth, see @ref filter_depths "combinations"; in the case of
@@ -881,11 +886,10 @@ The second case corresponds to a kernel of:

 \f[\vecthreethree{-1}{-2}{-1}{0}{0}{0}{1}{2}{1}\f]

-@note First returned matrix correspons to dx derivative while the second one to dy.
-
-@note Rounding to nearest even is procedeed if hardware supports it, if not - to nearest.
-
-@note Function textual ID is "org.opencv.imgproc.filters.sobelxy"
+@note
+ - First returned matrix corresponds to dx derivative while the second one to dy.
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.sobelxy"

 @param src input image.
@@ -1008,11 +1012,11 @@ described in @cite Shi94

 The function can be used to initialize a point-based tracker of an object.

-@note If the function is called with different values A and B of the parameter qualityLevel , and
+@note
+ - If the function is called with different values A and B of the parameter qualityLevel, and
 A \> B, the vector of returned corners with qualityLevel=A will be the prefix of the output vector
 with qualityLevel=B .
-
-@note Function textual ID is "org.opencv.imgproc.feature.goodFeaturesToTrack"
+ - Function textual ID is "org.opencv.imgproc.feature.goodFeaturesToTrack"

 @param image Input 8-bit or floating-point 32-bit, single-channel image.
 @param maxCorners Maximum number of corners to return. If there are more corners than are found,
@@ -1057,9 +1061,9 @@ The function equalizes the histogram of the input image using the following algo
 - Transform the image using \f$H'\f$ as a look-up table: \f$\texttt{dst}(x,y) = H'(\texttt{src}(x,y))\f$
 The algorithm normalizes the brightness and increases the contrast of the image.

-@note The returned image is of the same size and type as input.
-
-@note Function textual ID is "org.opencv.imgproc.equalizeHist"
+@note
+ - The returned image is of the same size and type as input.
+ - Function textual ID is "org.opencv.imgproc.equalizeHist"

 @param src Source 8-bit single channel image.
 */
@@ -1119,8 +1123,9 @@ image of labels ( @ref CV_32SC1 ). If #RETR_FLOODFILL -- @ref CV_32SC1 supports
 contours are extracted from the image ROI and then they should be analyzed in the whole image
 context.

-@return GArray of detected contours. Each contour is stored as a GArray of points.
-@return Optional output GArray of cv::Vec4i, containing information about the image topology.
+@return
+ - GArray of detected contours. Each contour is stored as a GArray of points.
+ - Optional output GArray of cv::Vec4i, containing information about the image topology.
 It has as many elements as the number of contours. For each i-th contour contours[i], the elements
 hierarchy[i][0] , hierarchy[i][1] , hierarchy[i][2] , and hierarchy[i][3] are set to 0-based
 indices in contours of the next and previous contours at the same hierarchical level, the first
@@ -1144,16 +1149,16 @@ of gray-scale image.

 The function calculates and returns the minimal up-right bounding rectangle for the specified
 point set or non-zero pixels of gray-scale image.

-@note Function textual ID is "org.opencv.imgproc.shape.boundingRectMat"
+@note
+ - Function textual ID is "org.opencv.imgproc.shape.boundingRectMat"
+ - In case of a 2D points' set given, Mat should be 2-dimensional, have a single row or column
+if there are 2 channels, or have 2 columns if there is a single channel. Mat should have either
+@ref CV_32S or @ref CV_32F depth

 @param src Input gray-scale image @ref CV_8UC1; or input set of @ref CV_32S or @ref CV_32F
 2D points stored in Mat.
-
-@note In case of a 2D points' set given, Mat should be 2-dimensional, have a single row or column
-if there are 2 channels, or have 2 columns if there is a single channel. Mat should have either
-@ref CV_32S or @ref CV_32F depth
 */
-GAPI_EXPORTS GOpaque<Rect> boundingRect(const GMat& src);
+GAPI_EXPORTS_W GOpaque<Rect> boundingRect(const GMat& src);

 /** @overload

@@ -1163,7 +1168,7 @@ Calculates the up-right bounding rectangle of a point set.

 @param src Input 2D point set, stored in std::vector<cv::Point2i>.
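+
+A minimal usage sketch (assumes the points are produced elsewhere in the graph):
+@code{.cpp}
+cv::GArray<cv::Point2i> pts;
+cv::GOpaque<cv::Rect> rc = cv::gapi::boundingRect(pts);
+@endcode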
*/ -GAPI_EXPORTS GOpaque boundingRect(const GArray& src); +GAPI_EXPORTS_W GOpaque boundingRect(const GArray& src); /** @overload @@ -1197,14 +1202,13 @@ The algorithm is based on the M-estimator ( , std::vector, std::vector. - -@note In case of an N-dimentional points' set given, Mat should be 2-dimensional, have a single row -or column if there are N channels, or have N columns if there is a single channel. - @param distType Distance used by the M-estimator, see #DistanceTypes. @ref DIST_USER and @ref DIST_C are not suppored. @param param Numerical parameter ( C ) for some types of distances. If it is 0, an optimal value @@ -1270,14 +1274,13 @@ The algorithm is based on the M-estimator ( , std::vector, std::vector. - -@note In case of an N-dimentional points' set given, Mat should be 2-dimensional, have a single row -or column if there are N channels, or have N columns if there is a single channel. - @param distType Distance used by the M-estimator, see #DistanceTypes. @ref DIST_USER and @ref DIST_C are not suppored. @param param Numerical parameter ( C ) for some types of distances. If it is 0, an optimal value @@ -1341,6 +1344,7 @@ Output image is 8-bit unsigned 3-channel image @ref CV_8UC3. GAPI_EXPORTS GMat BGR2RGB(const GMat& src); /** @brief Converts an image from RGB color space to gray-scaled. + The conventional ranges for R, G, and B channel values are 0 to 255. Resulting gray color value computed as \f[\texttt{dst} (I)= \texttt{0.299} * \texttt{src}(I).R + \texttt{0.587} * \texttt{src}(I).G + \texttt{0.114} * \texttt{src}(I).B \f] @@ -1367,6 +1371,7 @@ Resulting gray color value computed as GAPI_EXPORTS GMat RGB2Gray(const GMat& src, float rY, float gY, float bY); /** @brief Converts an image from BGR color space to gray-scaled. + The conventional ranges for B, G, and R channel values are 0 to 255. Resulting gray color value computed as \f[\texttt{dst} (I)= \texttt{0.114} * \texttt{src}(I).B + \texttt{0.587} * \texttt{src}(I).G + \texttt{0.299} * \texttt{src}(I).R \f] diff --git a/modules/gapi/include/opencv2/gapi/infer.hpp b/modules/gapi/include/opencv2/gapi/infer.hpp index b850775a62..6e71f59df9 100644 --- a/modules/gapi/include/opencv2/gapi/infer.hpp +++ b/modules/gapi/include/opencv2/gapi/infer.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2019-2020 Intel Corporation +// Copyright (C) 2019-2021 Intel Corporation #ifndef OPENCV_GAPI_INFER_HPP @@ -16,6 +16,7 @@ #include // tuple #include // is_same, false_type +#include // all_satisfy #include // any<> #include // GKernelType[M], GBackend #include // GArg @@ -27,40 +28,160 @@ namespace cv { template class GNetworkType; namespace detail { - template - struct valid_infer2_types; - // Terminal case 1 (50/50 success) - template - struct valid_infer2_types< std::tuple, std::tuple > { - // By default, Nets are limited to GMat argument types only - // for infer2, every GMat argument may translate to either - // GArray or GArray. GArray<> part is stripped - // already at this point. 
- static constexpr const auto value = - std::is_same::type, cv::GMat>::value - || std::is_same::type, cv::Rect>::value; +// Infer /////////////////////////////////////////////////////////////////////// +template +struct accepted_infer_types { + static constexpr const auto value = + std::is_same::type, cv::GMat>::value + || std::is_same::type, cv::GFrame>::value; +}; + +template +using valid_infer_types = all_satisfy; + +// Infer2 ////////////////////////////////////////////////////////////////////// + +template +struct valid_infer2_types; + +// Terminal case 1 (50/50 success) +template +struct valid_infer2_types< std::tuple, std::tuple > { + // By default, Nets are limited to GMat argument types only + // for infer2, every GMat argument may translate to either + // GArray or GArray. GArray<> part is stripped + // already at this point. + static constexpr const auto value = + std::is_same::type, cv::GMat>::value + || std::is_same::type, cv::Rect>::value; +}; + +// Terminal case 2 (100% failure) +template +struct valid_infer2_types< std::tuple<>, std::tuple > + : public std::false_type { +}; + +// Terminal case 3 (100% failure) +template +struct valid_infer2_types< std::tuple, std::tuple<> > + : public std::false_type { +}; + +// Recursion -- generic +template +struct valid_infer2_types< std::tuple, std::tuple > { + static constexpr const auto value = + valid_infer2_types< std::tuple, std::tuple >::value + && valid_infer2_types< std::tuple, std::tuple >::value; +}; + +// Struct stores network input/output names. +// Used by infer +struct InOutInfo +{ + std::vector in_names; + std::vector out_names; +}; + +template +class GInferOutputsTyped +{ +public: + GInferOutputsTyped() = default; + GInferOutputsTyped(std::shared_ptr call) + : m_priv(std::make_shared(std::move(call))) + { + } + + OutT at(const std::string& name) + { + auto it = m_priv->blobs.find(name); + if (it == m_priv->blobs.end()) { + // FIXME: Avoid modifying GKernel + auto shape = cv::detail::GTypeTraits::shape; + m_priv->call->kernel().outShapes.push_back(shape); + m_priv->call->kernel().outCtors.emplace_back(cv::detail::GObtainCtor::get()); + auto out_idx = static_cast(m_priv->blobs.size()); + it = m_priv->blobs.emplace(name, + cv::detail::Yield::yield(*(m_priv->call), out_idx)).first; + m_priv->info->out_names.push_back(name); + } + return it->second; + } +private: + struct Priv + { + Priv(std::shared_ptr c) + : call(std::move(c)), info(cv::util::any_cast(&call->params())) + { + } + + std::shared_ptr call; + InOutInfo* info = nullptr; + std::unordered_map blobs; }; - // Terminal case 2 (100% failure) - template - struct valid_infer2_types< std::tuple<>, std::tuple > - : public std::false_type { + std::shared_ptr m_priv; +}; + +template +class GInferInputsTyped +{ +public: + GInferInputsTyped() + : m_priv(std::make_shared()) + { + } + + template + void setInput(const std::string& name, U in) + { + m_priv->blobs.emplace(std::piecewise_construct, + std::forward_as_tuple(name), + std::forward_as_tuple(in)); + } + + using StorageT = cv::util::variant; + StorageT& operator[](const std::string& name) { + return m_priv->blobs[name]; + } + + using Map = std::unordered_map; + const Map& getBlobs() const { + return m_priv->blobs; + } + +private: + struct Priv + { + std::unordered_map blobs; }; - // Terminal case 3 (100% failure) - template - struct valid_infer2_types< std::tuple, std::tuple<> > - : public std::false_type { - }; + std::shared_ptr m_priv; +}; + +template +std::shared_ptr makeCall(const std::string &tag, + std::vector 
&&args, + std::vector &&names, + cv::GKinds &&kinds) { + auto call = std::make_shared(GKernel{ + InferT::id(), + tag, + InferT::getOutMeta, + {}, // outShape will be filled later + std::move(kinds), + {}, // outCtors will be filled later + }); + + call->setArgs(std::move(args)); + call->params() = cv::detail::InOutInfo{std::move(names), {}}; + + return call; +} - // Recursion -- generic - template - struct valid_infer2_types< std::tuple, std::tuple > { - static constexpr const auto value = - valid_infer2_types< std::tuple, std::tuple >::value - && valid_infer2_types< std::tuple, std::tuple >::value; - }; } // namespace detail // TODO: maybe tuple_wrap_helper from util.hpp may help with this. @@ -76,10 +197,6 @@ public: using API = std::function; using ResultL = std::tuple< cv::GArray... >; - using APIList = std::function, Args...)>; - - // FIXME: Args... must be limited to a single GMat - using APIRoi = std::function, Args...)>; }; // Single-return-value network definition (specialized base class) @@ -94,20 +211,48 @@ public: using API = std::function; using ResultL = cv::GArray; - using APIList = std::function, Args...)>; +}; - // FIXME: Args... must be limited to a single GMat - using APIRoi = std::function, Args...)>; +// InferAPI: Accepts either GMat or GFrame for very individual network's input +template +struct InferAPI { + using type = typename std::enable_if + < detail::valid_infer_types::value + && std::tuple_size::value == sizeof...(Ts) + , std::function + >::type; +}; + +// InferAPIRoi: Accepts a rectangle and either GMat or GFrame +template +struct InferAPIRoi { + using type = typename std::enable_if + < detail::valid_infer_types::value + && std::tuple_size::value == 1u + , std::function, T)> + >::type; +}; + +// InferAPIList: Accepts a list of rectangles and list of GMat/GFrames; +// crops every input. +template +struct InferAPIList { + using type = typename std::enable_if + < detail::valid_infer_types::value + && std::tuple_size::value == sizeof...(Ts) + , std::function, Ts...)> + >::type; }; // APIList2 is also template to allow different calling options // (GArray vs GArray per input) -template +template struct InferAPIList2 { using type = typename std::enable_if - < cv::detail::valid_infer2_types< typename Net::InArgs + < detail::valid_infer_types::value && + cv::detail::valid_infer2_types< typename Net::InArgs , std::tuple >::value, - std::function...)> + std::function...)> >::type; }; @@ -127,49 +272,6 @@ struct GInferBase { } }; -// Struct stores network input/output names. -// Used by infer -struct InOutInfo -{ - std::vector in_names; - std::vector out_names; -}; - -/** - * @{ - * @brief G-API object used to collect network inputs - */ -class GAPI_EXPORTS_W_SIMPLE GInferInputs -{ -using Map = std::unordered_map; -public: - GAPI_WRAP GInferInputs(); - GAPI_WRAP void setInput(const std::string& name, const cv::GMat& value); - - cv::GMat& operator[](const std::string& name); - const Map& getBlobs() const; - -private: - std::shared_ptr in_blobs; -}; -/** @} */ - -/** - * @{ - * @brief G-API object used to collect network outputs - */ -struct GAPI_EXPORTS_W_SIMPLE GInferOutputs -{ -public: - GAPI_WRAP GInferOutputs() = default; - GInferOutputs(std::shared_ptr call); - GAPI_WRAP cv::GMat at(const std::string& name); - -private: - struct Priv; - std::shared_ptr m_priv; -}; -/** @} */ // Base "InferROI" kernel. // All notes from "Infer" kernel apply here as well. struct GInferROIBase { @@ -206,11 +308,11 @@ struct GInferList2Base { // A generic inference kernel. 
API (::on()) is fully defined by the Net // template parameter. // Acts as a regular kernel in graph (via KernelTypeMedium). -template +template struct GInfer final : public GInferBase - , public detail::KernelTypeMedium< GInfer - , typename Net::API > { + , public detail::KernelTypeMedium< GInfer + , typename InferAPI::type > { using GInferBase::getOutMeta; // FIXME: name lookup conflict workaround? static constexpr const char* tag() { return Net::tag(); } @@ -218,11 +320,11 @@ struct GInfer final // A specific roi-inference kernel. API (::on()) is fixed here and // verified against Net. -template +template struct GInferROI final : public GInferROIBase - , public detail::KernelTypeMedium< GInferROI - , typename Net::APIRoi > { + , public detail::KernelTypeMedium< GInferROI + , typename InferAPIRoi::type > { using GInferROIBase::getOutMeta; // FIXME: name lookup conflict workaround? static constexpr const char* tag() { return Net::tag(); } @@ -231,11 +333,11 @@ struct GInferROI final // A generic roi-list inference kernel. API (::on()) is derived from // the Net template parameter (see more in infer<> overload). -template +template struct GInferList final : public GInferListBase - , public detail::KernelTypeMedium< GInferList - , typename Net::APIList > { + , public detail::KernelTypeMedium< GInferList + , typename InferAPIList::type > { using GInferListBase::getOutMeta; // FIXME: name lookup conflict workaround? static constexpr const char* tag() { return Net::tag(); } @@ -246,16 +348,100 @@ struct GInferList final // overload). // Takes an extra variadic template list to reflect how this network // was called (with Rects or GMats as array parameters) -template +template struct GInferList2 final : public GInferList2Base - , public detail::KernelTypeMedium< GInferList2 - , typename InferAPIList2::type > { + , public detail::KernelTypeMedium< GInferList2 + , typename InferAPIList2::type > { using GInferList2Base::getOutMeta; // FIXME: name lookup conflict workaround? 
static constexpr const char* tag() { return Net::tag(); } }; +/** + * @brief G-API object used to collect network inputs + */ +using GInferInputs = cv::detail::GInferInputsTyped; + +/** + * @brief G-API object used to collect the list of network inputs + */ +using GInferListInputs = cv::detail::GInferInputsTyped, cv::GArray>; + +/** + * @brief G-API object used to collect network outputs + */ +using GInferOutputs = cv::detail::GInferOutputsTyped; + +/** + * @brief G-API object used to collect the list of network outputs + */ +using GInferListOutputs = cv::detail::GInferOutputsTyped>; + +namespace detail { +void inline unpackBlobs(const cv::GInferInputs::Map& blobs, + std::vector& args, + std::vector& names, + cv::GKinds& kinds) +{ + for (auto&& p : blobs) { + names.emplace_back(p.first); + switch (p.second.index()) { + case cv::GInferInputs::StorageT::index_of(): + args.emplace_back(cv::util::get(p.second)); + kinds.emplace_back(cv::detail::OpaqueKind::CV_MAT); + break; + case cv::GInferInputs::StorageT::index_of(): + args.emplace_back(cv::util::get(p.second)); + kinds.emplace_back(cv::detail::OpaqueKind::CV_UNKNOWN); + break; + default: + GAPI_Assert(false); + } + } +} + +template +struct InferROITraits; + +template <> +struct InferROITraits +{ + using outType = cv::GInferOutputs; + using inType = cv::GOpaque; +}; + +template <> +struct InferROITraits +{ + using outType = cv::GInferListOutputs; + using inType = cv::GArray; +}; + +template +typename InferROITraits::outType +inferGenericROI(const std::string& tag, + const typename InferROITraits::inType& in, + const cv::GInferInputs& inputs) +{ + std::vector args; + std::vector names; + cv::GKinds kinds; + + args.emplace_back(in); + kinds.emplace_back(cv::detail::OpaqueKind::CV_RECT); + + unpackBlobs(inputs.getBlobs(), args, names, kinds); + + auto call = cv::detail::makeCall(tag, + std::move(args), + std::move(names), + std::move(kinds)); + + return {std::move(call)}; +} + +} // namespace detail } // namespace cv // FIXME: Probably the signature makes a function/tuple/function round-trip @@ -280,9 +466,9 @@ namespace gapi { * objects of appropriate type is returned. * @sa G_API_NET() */ -template -typename Net::Result infer(cv::GOpaque roi, cv::GMat in) { - return GInferROI::on(roi, in); +template +typename Net::Result infer(cv::GOpaque roi, T in) { + return GInferROI::on(roi, in); } /** @brief Calculates responses for the specified network (template @@ -300,7 +486,7 @@ typename Net::Result infer(cv::GOpaque roi, cv::GMat in) { */ template typename Net::ResultL infer(cv::GArray roi, Args&&... args) { - return GInferList::on(roi, std::forward(args)...); + return GInferList::on(roi, std::forward(args)...); } /** @brief Calculates responses for the specified network (template @@ -320,11 +506,12 @@ typename Net::ResultL infer(cv::GArray roi, Args&&... args) { * GArray<> objects is returned with the appropriate types inside. * @sa G_API_NET() */ -template -typename Net::ResultL infer2(cv::GMat image, cv::GArray... args) { + +template +typename Net::ResultL infer2(T image, cv::GArray... args) { // FIXME: Declared as "2" because in the current form it steals // overloads from the regular infer - return GInferList2::on(image, args...); + return GInferList2::on(image, args...); } /** @@ -340,7 +527,7 @@ typename Net::ResultL infer2(cv::GMat image, cv::GArray... args) { */ template typename Net::Result infer(Args&&... 
args) {
-    return GInfer<Net>::on(std::forward<Args>(args)...);
+    return GInfer<Net, Args...>::on(std::forward<Args>(args)...);
 }

 /**
@@ -355,38 +542,98 @@ struct Generic { };
  * @param inputs networks's inputs
  * @return a GInferOutputs
  */
-template<typename T = Generic> GInferOutputs
-infer(const std::string& tag, const GInferInputs& inputs)
+template<typename T = Generic> cv::GInferOutputs
+infer(const std::string& tag, const cv::GInferInputs& inputs)
 {
-    std::vector<GArg> input_args;
-    std::vector<std::string> input_names;
+    std::vector<cv::GArg> args;
+    std::vector<std::string> names;
+    cv::GKinds kinds;

-    const auto& blobs = inputs.getBlobs();
-    for (auto&& p : blobs)
-    {
-        input_names.push_back(p.first);
-        input_args.emplace_back(p.second);
-    }
+    cv::detail::unpackBlobs(inputs.getBlobs(), args, names, kinds);

-    GKinds kinds(blobs.size(), cv::detail::OpaqueKind::CV_MAT);
-    auto call = std::make_shared<cv::GCall>(GKernel{
-            GInferBase::id(),
-            tag,
-            GInferBase::getOutMeta,
-            {}, // outShape will be filled later
-            std::move(kinds),
-            {}, // outCtors will be filled later
-        });
+    auto call = cv::detail::makeCall<GInferBase>(tag,
+                                                 std::move(args),
+                                                 std::move(names),
+                                                 std::move(kinds));

-    call->setArgs(std::move(input_args));
-    call->params() = InOutInfo{input_names, {}};
-
-    return GInferOutputs{std::move(call)};
+    return cv::GInferOutputs{std::move(call)};
 }

-GAPI_EXPORTS_W inline GInferOutputs infer(const String& name, const GInferInputs& inputs)
+/** @brief Calculates response for the generic network
+ *  for the specified region in the source image.
+ *  Currently expects a single-input network only.
+ *
+ * @param tag a network tag
+ * @param roi an object describing the region of interest
+ *  in the source image. May be calculated in the same graph dynamically.
+ * @param inputs network's inputs
+ * @return a cv::GInferOutputs
+ */
+template<typename T = Generic> cv::GInferOutputs
+infer(const std::string& tag, const cv::GOpaque<cv::Rect>& roi, const cv::GInferInputs& inputs)
 {
-    return infer<Generic>(name, inputs);
+    return cv::detail::inferGenericROI<GInferROIBase>(tag, roi, inputs);
+}
+
+/** @brief Calculates responses for the specified network
+ *  for every region in the source image.
+ *
+ * @param tag a network tag
+ * @param rois a list of rectangles describing regions of interest
+ *  in the source image. Usually an output of object detector or tracker.
+ * @param inputs network's inputs
+ * @return a cv::GInferListOutputs
+ */
+template<typename T = Generic> cv::GInferListOutputs
+infer(const std::string& tag, const cv::GArray<cv::Rect>& rois, const cv::GInferInputs& inputs)
+{
+    return cv::detail::inferGenericROI<GInferListBase>(tag, rois, inputs);
+}
+
+/** @brief Calculates responses for the specified network
+ *  for every region in the source image, extended version.
+ *
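+ * A minimal usage sketch (the tag and blob names below are illustrative, not fixed by the API):
+ * @code{.cpp}
+ * cv::GMat in;
+ * cv::GArray<cv::Rect> rois;
+ * cv::GInferListInputs ins;
+ * ins["data"] = rois;
+ * cv::GInferListOutputs outs = cv::gapi::infer2("net-tag", in, ins);
+ * cv::GArray<cv::GMat> probs = outs.at("prob");
+ * @endcode
+ *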
+ * @param inputs networks's inputs + * @return a cv::GInferListOutputs + */ +template +typename std::enable_if::value, cv::GInferListOutputs>::type +infer2(const std::string& tag, + const Input& in, + const cv::GInferListInputs& inputs) +{ + std::vector args; + std::vector names; + cv::GKinds kinds; + + args.emplace_back(in); + auto k = cv::detail::GOpaqueTraits::kind; + kinds.emplace_back(k); + + for (auto&& p : inputs.getBlobs()) { + names.emplace_back(p.first); + switch (p.second.index()) { + case cv::GInferListInputs::StorageT::index_of>(): + args.emplace_back(cv::util::get>(p.second)); + kinds.emplace_back(cv::detail::OpaqueKind::CV_MAT); + break; + case cv::GInferListInputs::StorageT::index_of>(): + args.emplace_back(cv::util::get>(p.second)); + kinds.emplace_back(cv::detail::OpaqueKind::CV_RECT); + break; + default: + GAPI_Assert(false); + } + } + + auto call = cv::detail::makeCall(tag, + std::move(args), + std::move(names), + std::move(kinds)); + + return cv::GInferListOutputs{std::move(call)}; } } // namespace gapi @@ -418,8 +665,8 @@ struct GAPI_EXPORTS GNetParam { * @sa cv::gapi::networks */ struct GAPI_EXPORTS_W_SIMPLE GNetPackage { - GAPI_WRAP GNetPackage() : GNetPackage({}) {} - explicit GNetPackage(std::initializer_list &&ii); + GAPI_WRAP GNetPackage() = default; + explicit GNetPackage(std::initializer_list ii); std::vector backends() const; std::vector networks; }; diff --git a/modules/gapi/include/opencv2/gapi/infer/ie.hpp b/modules/gapi/include/opencv2/gapi/infer/ie.hpp index 53e31fbb09..60137c960c 100644 --- a/modules/gapi/include/opencv2/gapi/infer/ie.hpp +++ b/modules/gapi/include/opencv2/gapi/infer/ie.hpp @@ -8,6 +8,7 @@ #define OPENCV_GAPI_INFER_IE_HPP #include +#include #include #include #include // tuple, tuple_size @@ -67,6 +68,12 @@ namespace detail { Kind kind; bool is_generic; IEConfig config; + + std::map> reshape_table; + std::unordered_set layer_names_to_reshape; + + // NB: Number of asyncrhonious infer requests + size_t nireq; }; } // namespace detail @@ -91,7 +98,10 @@ public: , std::tuple_size::value // num_out , detail::ParamDesc::Kind::Load , false - , {}} { + , {} + , {} + , {} + , 1u} { }; Params(const std::string &model, @@ -101,7 +111,10 @@ public: , std::tuple_size::value // num_out , detail::ParamDesc::Kind::Import , false - , {}} { + , {} + , {} + , {} + , 1u} { }; Params& cfgInputLayers(const typename PortCfg::In &ll) { @@ -137,6 +150,42 @@ public: return *this; } + Params& cfgNumRequests(size_t nireq) { + GAPI_Assert(nireq > 0 && "Number of infer requests must be greater than zero!"); + desc.nireq = nireq; + return *this; + } + + Params& cfgInputReshape(std::map>&& reshape_table) { + desc.reshape_table = std::move(reshape_table); + return *this; + } + + Params& cfgInputReshape(const std::map>& reshape_table) { + desc.reshape_table = reshape_table; + return *this; + } + + Params& cfgInputReshape(std::string&& layer_name, std::vector&& layer_dims) { + desc.reshape_table.emplace(layer_name, layer_dims); + return *this; + } + + Params& cfgInputReshape(const std::string& layer_name, const std::vector& layer_dims) { + desc.reshape_table.emplace(layer_name, layer_dims); + return *this; + } + + Params& cfgInputReshape(std::unordered_set&& layer_names) { + desc.layer_names_to_reshape = std::move(layer_names); + return *this; + } + + Params& cfgInputReshape(const std::unordered_set& layer_names) { + desc.layer_names_to_reshape = layer_names; + return *this; + } + // BEGIN(G-API's network parametrization API) GBackend backend() const { return 
cv::gapi::ie::backend(); }
    std::string tag()     const { return Net::tag(); }
@@ -154,13 +203,13 @@ public:
            const std::string &model,
            const std::string &weights,
            const std::string &device)
-        : desc{ model, weights, device, {}, {}, {}, 0u, 0u, detail::ParamDesc::Kind::Load, true, {}}, m_tag(tag) {
+        : desc{ model, weights, device, {}, {}, {}, 0u, 0u, detail::ParamDesc::Kind::Load, true, {}, {}, {}, 1u}, m_tag(tag) {
     };

     Params(const std::string &tag,
            const std::string &model,
            const std::string &device)
-        : desc{ model, {}, device, {}, {}, {}, 0u, 0u, detail::ParamDesc::Kind::Import, true, {}}, m_tag(tag) {
+        : desc{ model, {}, device, {}, {}, {}, 0u, 0u, detail::ParamDesc::Kind::Import, true, {}, {}, {}, 1u}, m_tag(tag) {
     };

     Params& pluginConfig(IEConfig&& cfg) {
@@ -173,6 +222,19 @@ public:
         return *this;
     }

+    Params& constInput(const std::string &layer_name,
+                       const cv::Mat &data,
+                       TraitAs hint = TraitAs::TENSOR) {
+        desc.const_inputs[layer_name] = {data, hint};
+        return *this;
+    }
+
+    Params& cfgNumRequests(size_t nireq) {
+        GAPI_Assert(nireq > 0 && "Number of infer requests must be greater than zero!");
+        desc.nireq = nireq;
+        return *this;
+    }
+
     // BEGIN(G-API's network parametrization API)
     GBackend      backend() const { return cv::gapi::ie::backend(); }
     std::string   tag()     const { return m_tag; }
diff --git a/modules/gapi/include/opencv2/gapi/infer/onnx.hpp b/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
index d61ceb3dca..3a4e35fb09 100644
--- a/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
@@ -58,6 +58,8 @@ struct ParamDesc {
     PostProc custom_post_proc;

     std::vector<bool> normalize;
+
+    std::vector<std::string> names_to_remap;
 };
 } // namespace detail
@@ -86,7 +88,7 @@ public:
     };

     // BEGIN(G-API's network parametrization API)
-    GBackend backend() const { return cv::gapi::onnx::backend(); }
+    GBackend      backend() const { return cv::gapi::onnx::backend(); }
     std::string   tag()     const { return Net::tag(); }
     cv::util::any params()  const { return { desc }; }
     // END(G-API's network parametrization API)
@@ -115,13 +117,70 @@ public:
         return *this;
     }

-    Params& cfgPostProc(const std::vector<cv::GMatDesc> &outs,
+    /** @brief Configures the graph output and sets a user post-processing function.
+
+    Use this function when inferring networks with dynamic outputs. Since the
+    output parameters of such networks are not known in advance, they have to
+    be provided explicitly so that the graph output can be constructed. The
+    function takes meta information for the outputs together with a
+    post-processing function, which copies data from the ONNX inference result
+    into the graph output allocated according to that meta information.
+
+    @param out_metas out meta information.
+    @param pp post-processing function with two parameters: the first is the
+    ONNX inference result, the second is the graph output. Both parameters are
+    maps from a layer name to a cv::Mat.
+    @return reference to this object of class Params.
+    */
+    Params& cfgPostProc(const std::vector<cv::GMatDesc> &out_metas,
                         const PostProc &pp) {
-        desc.out_metas = outs;
+        desc.out_metas = out_metas;
         desc.custom_post_proc = pp;
         return *this;
     }

+    /** @overload
+    The function accepts rvalue parameters.
+    */
+    Params& cfgPostProc(std::vector<cv::GMatDesc> &&out_metas,
+                        PostProc &&pp) {
+        desc.out_metas = std::move(out_metas);
+        desc.custom_post_proc = std::move(pp);
+        return *this;
+    }
+
+    /** @overload
+    The function takes an additional parameter, names_to_remap, which lists the
+    output layers to be used both for inference and in the post-processing function.
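The new reshape and request-count options compose with the existing fluent Params API. A sketch for a hypothetical single-input network type MyNet (model paths, device and layer name are illustrative):

    G_API_NET(MyNet, <cv::GMat(cv::GMat)>, "my-net");
    auto params = cv::gapi::ie::Params<MyNet>{"model.xml", "model.bin", "CPU"}
        .cfgInputReshape("data", std::vector<std::size_t>{1u, 3u, 320u, 320u})
        .cfgNumRequests(4u);        // four asynchronous infer requests
    auto nets = cv::gapi::networks(params);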
+ + @param out_metas out meta information. + @param pp post processing function. + @param names_to_remap contains names of output layers. CNN's infer will be done on these layers. + Infer's result will be processed in post processing function using these names. + @return reference to object of class Params. + */ + Params& cfgPostProc(const std::vector &out_metas, + const PostProc &pp, + const std::vector &names_to_remap) { + desc.out_metas = out_metas; + desc.custom_post_proc = pp; + desc.names_to_remap = names_to_remap; + return *this; + } + + /** @overload + The function has rvalue parameters. + */ + Params& cfgPostProc(std::vector &&out_metas, + PostProc &&pp, + std::vector &&names_to_remap) { + desc.out_metas = std::move(out_metas); + desc.custom_post_proc = std::move(pp); + desc.names_to_remap = std::move(names_to_remap); + return *this; + } + Params& cfgNormalize(const typename PortCfg::Normalize &n) { desc.normalize.assign(n.begin(), n.end()); return *this; diff --git a/modules/gapi/include/opencv2/gapi/infer/parsers.hpp b/modules/gapi/include/opencv2/gapi/infer/parsers.hpp index 15742c6e55..3225c73831 100644 --- a/modules/gapi/include/opencv2/gapi/infer/parsers.hpp +++ b/modules/gapi/include/opencv2/gapi/infer/parsers.hpp @@ -85,11 +85,11 @@ the larger side of the rectangle. @param filterOutOfBounds If provided true, out-of-frame boxes are filtered. @return a vector of detected bounding boxes. */ -GAPI_EXPORTS GArray parseSSD(const GMat& in, - const GOpaque& inSz, - const float confidenceThreshold = 0.5f, - const bool alignmentToSquare = false, - const bool filterOutOfBounds = false); +GAPI_EXPORTS_W GArray parseSSD(const GMat& in, + const GOpaque& inSz, + const float confidenceThreshold = 0.5f, + const bool alignmentToSquare = false, + const bool filterOutOfBounds = false); /** @brief Parses output of Yolo network. @@ -108,8 +108,8 @@ detection is smaller than confidence threshold, detection is rejected. relative box intersection area required for rejecting the box with a smaller confidence. If 1.f, nms is not performed and no boxes are rejected. @param anchors Anchors Yolo network was trained with. -@note The default anchor values are taken from openvinotoolkit docs: -https://docs.openvinotoolkit.org/latest/omz_models_intel_yolo_v2_tiny_vehicle_detection_0001_description_yolo_v2_tiny_vehicle_detection_0001.html#output. +@note The default anchor values are specified for YOLO v2 Tiny as described in Intel Open Model Zoo +documentation. @return a tuple with a vector of detected boxes and a vector of appropriate labels. 
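As a usage sketch, the wrapped parseSSD overload above plugs directly into a graph; here "detections" and "frame_size" are assumed to be produced by earlier graph stages (names illustrative):

    cv::GMat detections;                 // raw SSD output blob
    cv::GOpaque<cv::Size> frame_size;    // original frame size
    cv::GArray<cv::Rect> boxes =
        cv::gapi::parseSSD(detections, frame_size,
                           0.5f,     // confidenceThreshold
                           false,    // alignmentToSquare
                           true);    // filterOutOfBounds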
*/ GAPI_EXPORTS std::tuple, GArray> parseYolo(const GMat& in, diff --git a/modules/gapi/include/opencv2/gapi/media.hpp b/modules/gapi/include/opencv2/gapi/media.hpp index f27cb80913..3d7f5a5b65 100644 --- a/modules/gapi/include/opencv2/gapi/media.hpp +++ b/modules/gapi/include/opencv2/gapi/media.hpp @@ -30,9 +30,21 @@ public: View access(Access) const; cv::GFrameDesc desc() const; + // Cast underlying MediaFrame adapter to the particular adapter type, + // return nullptr if underlying type is different + template T* get() const + { + static_assert(std::is_base_of::value, + "T is not derived from cv::MediaFrame::IAdapter!"); + auto* adapter = getAdapter(); + GAPI_Assert(adapter != nullptr); + return dynamic_cast(adapter); + } + private: struct Priv; std::shared_ptr m; + IAdapter* getAdapter() const; }; template diff --git a/modules/gapi/include/opencv2/gapi/own/assert.hpp b/modules/gapi/include/opencv2/gapi/own/assert.hpp index d0e0f1c3ff..d50543fdac 100644 --- a/modules/gapi/include/opencv2/gapi/own/assert.hpp +++ b/modules/gapi/include/opencv2/gapi/own/assert.hpp @@ -2,16 +2,28 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2020 Intel Corporation #ifndef OPENCV_GAPI_OWN_ASSERT_HPP #define OPENCV_GAPI_OWN_ASSERT_HPP +#include + +#define GAPI_DbgAssertNoOp(expr) { \ + constexpr bool _assert_tmp = false && (expr); \ + cv::util::suppress_unused_warning(_assert_tmp); \ +} + #if !defined(GAPI_STANDALONE) #include #define GAPI_Assert CV_Assert -#define GAPI_DbgAssert CV_DbgAssert + +#if defined _DEBUG || defined CV_STATIC_ANALYSIS +# define GAPI_DbgAssert CV_DbgAssert +#else +# define GAPI_DbgAssert(expr) GAPI_DbgAssertNoOp(expr) +#endif #else #include @@ -33,7 +45,7 @@ namespace detail #ifdef NDEBUG -# define GAPI_DbgAssert(expr) +# define GAPI_DbgAssert(expr) GAPI_DbgAssertNoOp(expr) #else # define GAPI_DbgAssert(expr) GAPI_Assert(expr) #endif diff --git a/modules/gapi/include/opencv2/gapi/own/exports.hpp b/modules/gapi/include/opencv2/gapi/own/exports.hpp index da42a3238c..1978991b75 100644 --- a/modules/gapi/include/opencv2/gapi/own/exports.hpp +++ b/modules/gapi/include/opencv2/gapi/own/exports.hpp @@ -12,10 +12,12 @@ # include # define GAPI_EXPORTS CV_EXPORTS /* special informative macros for wrapper generators */ +# define GAPI_PROP CV_PROP # define GAPI_WRAP CV_WRAP # define GAPI_EXPORTS_W_SIMPLE CV_EXPORTS_W_SIMPLE # define GAPI_EXPORTS_W CV_EXPORTS_W # else +# define GAPI_PROP # define GAPI_WRAP # define GAPI_EXPORTS # define GAPI_EXPORTS_W_SIMPLE diff --git a/modules/gapi/include/opencv2/gapi/python/python.hpp b/modules/gapi/include/opencv2/gapi/python/python.hpp new file mode 100644 index 0000000000..1c85d69d9f --- /dev/null +++ b/modules/gapi/include/opencv2/gapi/python/python.hpp @@ -0,0 +1,58 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
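The new MediaFrame::get<T>() helper makes backend-specific adapter access explicit. A sketch, where MyAdapter is a hypothetical cv::MediaFrame::IAdapter implementation:

    cv::MediaFrame frame = cv::MediaFrame::Create<MyAdapter>();  // MyAdapter is hypothetical
    if (auto* adapter = frame.get<MyAdapter>()) {
        // use the adapter-specific interface here
    }   // get<T>() returns nullptr when the underlying adapter type differs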
+// +// Copyright (C) 2021 Intel Corporation + + +#ifndef OPENCV_GAPI_PYTHON_API_HPP +#define OPENCV_GAPI_PYTHON_API_HPP + +#include // GKernelPackage +#include // GAPI_EXPORTS + +namespace cv { +namespace gapi { +namespace python { + +GAPI_EXPORTS cv::gapi::GBackend backend(); + +struct GPythonContext +{ + const cv::GArgs &ins; + const cv::GMetaArgs &in_metas; + const cv::GTypesInfo &out_info; +}; + +using Impl = std::function; + +class GAPI_EXPORTS GPythonKernel +{ +public: + GPythonKernel() = default; + GPythonKernel(Impl run); + + cv::GRunArgs operator()(const GPythonContext& ctx); +private: + Impl m_run; +}; + +class GAPI_EXPORTS GPythonFunctor : public cv::gapi::GFunctor +{ +public: + using Meta = cv::GKernel::M; + + GPythonFunctor(const char* id, const Meta &meta, const Impl& impl); + + GKernelImpl impl() const override; + gapi::GBackend backend() const override; + +private: + GKernelImpl impl_; +}; + +} // namespace python +} // namespace gapi +} // namespace cv + +#endif // OPENCV_GAPI_PYTHON_API_HPP diff --git a/modules/gapi/include/opencv2/gapi/render/render.hpp b/modules/gapi/include/opencv2/gapi/render/render.hpp index a4df304289..a84c26c810 100644 --- a/modules/gapi/include/opencv2/gapi/render/render.hpp +++ b/modules/gapi/include/opencv2/gapi/render/render.hpp @@ -97,6 +97,17 @@ void GAPI_EXPORTS render(cv::Mat& y_plane, const Prims& prims, cv::GCompileArgs&& args = {}); +/** @brief The function renders on the input media frame passed drawing primitivies + +@param frame input Media Frame : @ref cv::MediaFrame. +@param prims vector of drawing primitivies +@param args graph compile time parameters +*/ +void GAPI_EXPORTS render(cv::MediaFrame& frame, + const Prims& prims, + cv::GCompileArgs&& args = {}); + + G_TYPED_KERNEL_M(GRenderNV12, )>, "org.opencv.render.nv12") { static GMatDesc2 outMeta(GMatDesc y_plane, GMatDesc uv_plane, GArrayDesc) @@ -113,6 +124,14 @@ G_TYPED_KERNEL(GRenderBGR, )>, "or } }; +G_TYPED_KERNEL(GRenderFrame, )>, "org.opencv.render.frame") +{ + static GFrameDesc outMeta(GFrameDesc desc, GArrayDesc) + { + return desc; + } +}; + /** @brief Renders on 3 channels input Output image must be 8-bit unsigned planar 3-channel image @@ -134,6 +153,17 @@ uv image must be 8-bit unsigned planar 2-channel image @ref CV_8UC2 GAPI_EXPORTS GMat2 renderNV12(const GMat& y, const GMat& uv, const GArray& prims); + +/** @brief Renders Media Frame + +Output media frame frame cv::MediaFrame + +@param m_frame input image: cv::MediaFrame @ref cv::MediaFrame +@param prims draw primitives +*/ +GAPI_EXPORTS GFrame renderFrame(const GFrame& m_frame, + const GArray& prims); + //! @} gapi_draw_api } // namespace draw diff --git a/modules/gapi/include/opencv2/gapi/s11n.hpp b/modules/gapi/include/opencv2/gapi/s11n.hpp index 0e2c4c239b..5a64410e5a 100644 --- a/modules/gapi/include/opencv2/gapi/s11n.hpp +++ b/modules/gapi/include/opencv2/gapi/s11n.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
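A sketch of the MediaFrame render overload introduced above; the frame is assumed to be a valid (e.g. NV12) cv::MediaFrame produced elsewhere:

    cv::MediaFrame frame;   // assumed to hold a valid frame in practice
    cv::gapi::wip::draw::Prims prims;
    prims.emplace_back(cv::gapi::wip::draw::Rect(cv::Rect(10, 10, 64, 64),
                                                 cv::Scalar(0, 255, 0), 2));
    cv::gapi::wip::draw::render(frame, prims);   // draws in-place on the frame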
// -// Copyright (C) 2020 Intel Corporation +// Copyright (C) 2020-2021 Intel Corporation #ifndef OPENCV_GAPI_S11N_HPP #define OPENCV_GAPI_S11N_HPP @@ -24,6 +24,8 @@ namespace detail { GAPI_EXPORTS cv::GRunArgs getRunArgs(const std::vector &p); + GAPI_EXPORTS std::vector getVectorOfStrings(const std::vector &p); + template cv::GCompileArgs getCompileArgs(const std::vector &p); @@ -42,6 +44,7 @@ T deserialize(const std::vector &p); GAPI_EXPORTS std::vector serialize(const cv::GCompileArgs&); GAPI_EXPORTS std::vector serialize(const cv::GMetaArgs&); GAPI_EXPORTS std::vector serialize(const cv::GRunArgs&); +GAPI_EXPORTS std::vector serialize(const std::vector&); template<> inline cv::GComputation deserialize(const std::vector &p) { @@ -58,6 +61,11 @@ cv::GRunArgs deserialize(const std::vector &p) { return detail::getRunArgs(p); } +template<> inline +std::vector deserialize(const std::vector &p) { + return detail::getVectorOfStrings(p); +} + template inline typename std::enable_if::value, GCompileArgs>:: type deserialize(const std::vector &p) { diff --git a/modules/gapi/include/opencv2/gapi/stereo.hpp b/modules/gapi/include/opencv2/gapi/stereo.hpp new file mode 100644 index 0000000000..908045d4c7 --- /dev/null +++ b/modules/gapi/include/opencv2/gapi/stereo.hpp @@ -0,0 +1,64 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distereoibution and at http://opencv.org/license.html. +// +// Copyright (C) 2021 Intel Corporation + +#ifndef OPENCV_GAPI_STEREO_HPP +#define OPENCV_GAPI_STEREO_HPP + +#include +#include +#include + +namespace cv { +namespace gapi { + +enum class StereoOutputFormat { + DEPTH_FLOAT16, + DEPTH_FLOAT32, + DISPARITY_FIXED16_11_5, + DISPARITY_FIXED16_12_4 +}; + +namespace calib3d { + +G_TYPED_KERNEL(GStereo, , "org.opencv.stereo") { + static GMatDesc outMeta(const GMatDesc &left, const GMatDesc &right, const StereoOutputFormat of) { + GAPI_Assert(left.chan == 1); + GAPI_Assert(left.depth == CV_8U); + + GAPI_Assert(right.chan == 1); + GAPI_Assert(right.depth == CV_8U); + + switch(of) { + case StereoOutputFormat::DEPTH_FLOAT16: + return left.withDepth(CV_16FC1); + case StereoOutputFormat::DEPTH_FLOAT32: + return left.withDepth(CV_32FC1); + case StereoOutputFormat::DISPARITY_FIXED16_11_5: + case StereoOutputFormat::DISPARITY_FIXED16_12_4: + return left.withDepth(CV_16SC1); + default: + GAPI_Assert(false && "Unknown output format!"); + } + } +}; + +} // namespace calib3d + +/** @brief Extract disparity/depth information depending on passed StereoOutputFormat argument. +The function extracts disparity/depth information depending on passed StereoOutputFormat argument from +given stereo-pair. + +@param left left 8-bit unsigned 1-channel image of @ref CV_8UC1 type +@param right right 8-bit unsigned 1-channel image of @ref CV_8UC1 type +@param of enum to specify output kind: depth or disparity and corresponding type +*/ +GAPI_EXPORTS GMat stereo(const GMat& left, + const GMat& right, + const StereoOutputFormat of = StereoOutputFormat::DEPTH_FLOAT32); +} // namespace gapi +} // namespace cv + +#endif // OPENCV_GAPI_STEREO_HPP diff --git a/modules/gapi/include/opencv2/gapi/streaming/format.hpp b/modules/gapi/include/opencv2/gapi/streaming/format.hpp new file mode 100644 index 0000000000..c9d2fa3e0a --- /dev/null +++ b/modules/gapi/include/opencv2/gapi/streaming/format.hpp @@ -0,0 +1,94 @@ +// This file is part of OpenCV project. 
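The new stereo API above composes like any other G-API operation; a minimal sketch, assuming rectified 8-bit single-channel inputs:

    cv::GMat left, right;
    cv::GMat depth = cv::gapi::stereo(left, right,
                                      cv::gapi::StereoOutputFormat::DEPTH_FLOAT32);
    cv::GComputation sgm(cv::GIn(left, right), cv::GOut(depth));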
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2020 Intel Corporation + +#ifndef OPENCV_GAPI_GSTREAMING_FORMAT_HPP +#define OPENCV_GAPI_GSTREAMING_FORMAT_HPP + +#include // GKernelPackage + +namespace cv { +namespace gapi { +namespace streaming { + +GAPI_EXPORTS cv::gapi::GKernelPackage kernels(); + +G_API_OP(GBGR, , "org.opencv.streaming.BGR") +{ + static GMatDesc outMeta(const GFrameDesc& in) { return GMatDesc{CV_8U, 3, in.size}; } +}; + +G_API_OP(GY, , "org.opencv.streaming.Y") { + static GMatDesc outMeta(const GFrameDesc& frameDesc) { + return GMatDesc { CV_8U, 1, frameDesc.size , false }; + } +}; + +G_API_OP(GUV, , "org.opencv.streaming.UV") { + static GMatDesc outMeta(const GFrameDesc& frameDesc) { + return GMatDesc { CV_8U, 2, cv::Size(frameDesc.size.width / 2, frameDesc.size.height / 2), + false }; + } +}; + +/** @brief Gets bgr plane from input frame + +@note Function textual ID is "org.opencv.streaming.BGR" + +@param in Input frame +@return Image in BGR format +*/ +GAPI_EXPORTS cv::GMat BGR(const cv::GFrame& in); + +/** @brief Extracts Y plane from media frame. + +Output image is 8-bit 1-channel image of @ref CV_8UC1. + +@note Function textual ID is "org.opencv.streaming.Y" + +@param frame input media frame. +*/ +GAPI_EXPORTS GMat Y(const cv::GFrame& frame); + +/** @brief Extracts UV plane from media frame. + +Output image is 8-bit 2-channel image of @ref CV_8UC2. + +@note Function textual ID is "org.opencv.streaming.UV" + +@param frame input media frame. +*/ +GAPI_EXPORTS GMat UV(const cv::GFrame& frame); +} // namespace streaming + +//! @addtogroup gapi_transform +//! @{ +/** @brief Makes a copy of the input image. Note that this copy may be not real +(no actual data copied). Use this function to maintain graph contracts, +e.g when graph's input needs to be passed directly to output, like in Streaming mode. + +@note Function textual ID is "org.opencv.streaming.copy" + +@param in Input image +@return Copy of the input +*/ +GAPI_EXPORTS GMat copy(const GMat& in); + +/** @brief Makes a copy of the input frame. Note that this copy may be not real +(no actual data copied). Use this function to maintain graph contracts, +e.g when graph's input needs to be passed directly to output, like in Streaming mode. + +@note Function textual ID is "org.opencv.streaming.copy" + +@param in Input frame +@return Copy of the input +*/ +GAPI_EXPORTS GFrame copy(const GFrame& in); +//! @} gapi_transform + +} // namespace gapi +} // namespace cv + +#endif // OPENCV_GAPI_GSTREAMING_FORMAT_HPP diff --git a/modules/gapi/include/opencv2/gapi/streaming/sync.hpp b/modules/gapi/include/opencv2/gapi/streaming/sync.hpp new file mode 100644 index 0000000000..5801e6f00a --- /dev/null +++ b/modules/gapi/include/opencv2/gapi/streaming/sync.hpp @@ -0,0 +1,30 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
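A usage sketch for the plane accessors above, pulling individual planes out of a streaming cv::GFrame:

    cv::GFrame in;
    cv::GMat bgr = cv::gapi::streaming::BGR(in);  // full-resolution BGR
    cv::GMat y   = cv::gapi::streaming::Y(in);    // luma plane, CV_8UC1
    cv::GMat uv  = cv::gapi::streaming::UV(in);   // chroma plane, CV_8UC2, half size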
+// +// Copyright (C) 2021 Intel Corporation + +#ifndef OPENCV_GAPI_STREAMING_SYNC_HPP +#define OPENCV_GAPI_STREAMING_SYNC_HPP + +namespace cv { +namespace gapi { +namespace streaming { + +enum class sync_policy { + dont_sync, + drop +}; + +} // namespace streaming +} // namespace gapi + +namespace detail { + template<> struct CompileArgTag { + static const char* tag() { return "gapi.streaming.sync_policy"; } + }; + +} // namespace detail +} // namespace cv + +#endif // OPENCV_GAPI_STREAMING_SYNC_HPP diff --git a/modules/gapi/include/opencv2/gapi/util/copy_through_move.hpp b/modules/gapi/include/opencv2/gapi/util/copy_through_move.hpp new file mode 100644 index 0000000000..1a1121eb21 --- /dev/null +++ b/modules/gapi/include/opencv2/gapi/util/copy_through_move.hpp @@ -0,0 +1,34 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2020 Intel Corporation + +#ifndef OPENCV_GAPI_UTIL_COPY_THROUGH_MOVE_HPP +#define OPENCV_GAPI_UTIL_COPY_THROUGH_MOVE_HPP + +#include //decay_t + +namespace cv +{ +namespace util +{ + //This is a tool to move initialize captures of a lambda in C++11 + template + struct copy_through_move_t{ + T value; + const T& get() const {return value;} + T& get() {return value;} + copy_through_move_t(T&& g) : value(std::move(g)) {} + copy_through_move_t(copy_through_move_t&&) = default; + copy_through_move_t(copy_through_move_t const& lhs) : copy_through_move_t(std::move(const_cast(lhs))) {} + }; + + template + copy_through_move_t> copy_through_move(T&& t){ + return std::forward(t); + } +} // namespace util +} // namespace cv + +#endif /* OPENCV_GAPI_UTIL_COPY_THROUGH_MOVE_HPP */ diff --git a/modules/gapi/include/opencv2/gapi/util/optional.hpp b/modules/gapi/include/opencv2/gapi/util/optional.hpp index 1aa2b265d9..6c8ceebbda 100644 --- a/modules/gapi/include/opencv2/gapi/util/optional.hpp +++ b/modules/gapi/include/opencv2/gapi/util/optional.hpp @@ -35,9 +35,9 @@ namespace util // instead {} optional() {}; optional(const optional&) = default; - explicit optional(T &&value) noexcept; - explicit optional(const T &value) noexcept; - optional(optional &&) noexcept; + explicit optional(T&&) noexcept; + explicit optional(const T&) noexcept; + optional(optional&&) noexcept; // TODO: optional(nullopt_t) noexcept; // TODO: optional(const optional &) // TODO: optional(optional &&) @@ -46,8 +46,8 @@ namespace util // TODO: optional(U&& value); // Assignment - optional& operator=(const optional& rhs) = default; - optional& operator=(optional&& rhs); + optional& operator=(const optional&) = default; + optional& operator=(optional&&); // Observers T* operator-> (); @@ -84,7 +84,7 @@ namespace util // Implementation ////////////////////////////////////////////////////////// template optional::optional(T &&v) noexcept - : m_holder(v) + : m_holder(std::move(v)) { } diff --git a/modules/gapi/include/opencv2/gapi/video.hpp b/modules/gapi/include/opencv2/gapi/video.hpp index 7f90134e6d..10965b0aa6 100644 --- a/modules/gapi/include/opencv2/gapi/video.hpp +++ b/modules/gapi/include/opencv2/gapi/video.hpp @@ -16,6 +16,32 @@ */ namespace cv { namespace gapi { + +/** @brief Structure for the Kalman filter's initialization parameters.*/ + +struct GAPI_EXPORTS KalmanParams +{ + // initial state + + //! corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k)) + Mat state; + //! 
posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k) + Mat errorCov; + + // dynamic system description + + //! state transition matrix (A) + Mat transitionMatrix; + //! measurement matrix (H) + Mat measurementMatrix; + //! process noise covariance matrix (Q) + Mat processNoiseCov; + //! measurement noise covariance matrix (R) + Mat measurementNoiseCov; + //! control matrix (B) (Optional: not used if there's no control) + Mat controlMatrix; +}; + namespace video { using GBuildPyrOutput = std::tuple, GScalar>; @@ -62,6 +88,95 @@ G_TYPED_KERNEL(GCalcOptFlowLKForPyr, return std::make_tuple(empty_array_desc(), empty_array_desc(), empty_array_desc()); } }; + +enum BackgroundSubtractorType +{ + TYPE_BS_MOG2, + TYPE_BS_KNN +}; + +/** @brief Structure for the Background Subtractor operation's initialization parameters.*/ + +struct BackgroundSubtractorParams +{ + //! Type of the Background Subtractor operation. + BackgroundSubtractorType operation = TYPE_BS_MOG2; + + //! Length of the history. + int history = 500; + + //! For MOG2: Threshold on the squared Mahalanobis distance between the pixel + //! and the model to decide whether a pixel is well described by + //! the background model. + //! For KNN: Threshold on the squared distance between the pixel and the sample + //! to decide whether a pixel is close to that sample. + double threshold = 16; + + //! If true, the algorithm will detect shadows and mark them. + bool detectShadows = true; + + //! The value between 0 and 1 that indicates how fast + //! the background model is learnt. + //! Negative parameter value makes the algorithm use some automatically + //! chosen learning rate. + double learningRate = -1; + + //! default constructor + BackgroundSubtractorParams() {} + + /** Full constructor + @param op MOG2/KNN Background Subtractor type. + @param histLength Length of the history. + @param thrshld For MOG2: Threshold on the squared Mahalanobis distance between + the pixel and the model to decide whether a pixel is well described by the background model. + For KNN: Threshold on the squared distance between the pixel and the sample to decide + whether a pixel is close to that sample. + @param detect If true, the algorithm will detect shadows and mark them. It decreases the + speed a bit, so if you do not need this feature, set the parameter to false. + @param lRate The value between 0 and 1 that indicates how fast the background model is learnt. + Negative parameter value makes the algorithm to use some automatically chosen learning rate. 
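Together with the cv::gapi::BackgroundSubtractor function declared further below, the parameters structure above is used like this (all values illustrative):

    cv::gapi::video::BackgroundSubtractorParams bs_params(
        cv::gapi::video::TYPE_BS_MOG2,
        500,      // histLength
        16.0,     // thrshld
        true,     // detect shadows
        -1.0);    // lRate: automatically chosen learning rate
    cv::GMat in;
    cv::GMat fg_mask = cv::gapi::BackgroundSubtractor(in, bs_params);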
+ */ + BackgroundSubtractorParams(BackgroundSubtractorType op, int histLength, + double thrshld, bool detect, double lRate) : operation(op), + history(histLength), + threshold(thrshld), + detectShadows(detect), + learningRate(lRate){} +}; + +G_TYPED_KERNEL(GBackgroundSubtractor, , + "org.opencv.video.BackgroundSubtractor") +{ + static GMatDesc outMeta(const GMatDesc& in, const BackgroundSubtractorParams& bsParams) + { + GAPI_Assert(bsParams.history >= 0); + GAPI_Assert(bsParams.learningRate <= 1); + return in.withType(CV_8U, 1); + } +}; + +void checkParams(const cv::gapi::KalmanParams& kfParams, + const cv::GMatDesc& measurement, const cv::GMatDesc& control = {}); + +G_TYPED_KERNEL(GKalmanFilter, , GMat, KalmanParams)>, + "org.opencv.video.KalmanFilter") +{ + static GMatDesc outMeta(const GMatDesc& measurement, const GOpaqueDesc&, + const GMatDesc& control, const KalmanParams& kfParams) + { + checkParams(kfParams, measurement, control); + return measurement.withSize(Size(1, kfParams.transitionMatrix.rows)); + } +}; + +G_TYPED_KERNEL(GKalmanFilterNoControl, , KalmanParams)>, "org.opencv.video.KalmanFilterNoControl") +{ + static GMatDesc outMeta(const GMatDesc& measurement, const GOpaqueDesc&, const KalmanParams& kfParams) + { + checkParams(kfParams, measurement); + return measurement.withSize(Size(1, kfParams.transitionMatrix.rows)); + } +}; } //namespace video //! @addtogroup gapi_video @@ -83,8 +198,9 @@ G_TYPED_KERNEL(GCalcOptFlowLKForPyr, @param tryReuseInputImage put ROI of input image into the pyramid if possible. You can pass false to force data copying. -@return output pyramid. -@return number of levels in constructed pyramid. Can be less than maxLevel. +@return + - output pyramid. + - number of levels in constructed pyramid. Can be less than maxLevel. */ GAPI_EXPORTS std::tuple, GScalar> buildOpticalFlowPyramid(const GMat &img, @@ -131,11 +247,12 @@ by number of pixels in a window; if this value is less than minEigThreshold, the feature is filtered out and its flow is not processed, so it allows to remove bad points and get a performance boost. -@return GArray of 2D points (with single-precision floating-point coordinates) +@return + - GArray of 2D points (with single-precision floating-point coordinates) containing the calculated new positions of input features in the second image. -@return status GArray (of unsigned chars); each element of the vector is set to 1 if + - status GArray (of unsigned chars); each element of the vector is set to 1 if the flow for the corresponding features has been found, otherwise, it is set to 0. -@return GArray of errors (doubles); each element of the vector is set to an error for the + - GArray of errors (doubles); each element of the vector is set to an error for the corresponding feature, type of the error measure can be set in flags parameter; if the flow wasn't found then the error is not defined (use the status parameter to find such cases). */ @@ -169,8 +286,75 @@ calcOpticalFlowPyrLK(const GArray &prevPyr, int flags = 0, double minEigThresh = 1e-4); +/** @brief Gaussian Mixture-based or K-nearest neighbours-based Background/Foreground Segmentation Algorithm. +The operation generates a foreground mask. + +@return Output image is foreground mask, i.e. 8-bit unsigned 1-channel (binary) matrix @ref CV_8UC1. + +@note Functional textual ID is "org.opencv.video.BackgroundSubtractor" + +@param src input image: Floating point frame is used without scaling and should be in range [0,255]. 
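A sketch of filling cv::gapi::KalmanParams for a simple constant-velocity model (two state variables, one measurement; all values illustrative) and feeding it to the cv::gapi::KalmanFilter operation declared below:

    cv::gapi::KalmanParams kp;
    kp.state               = cv::Mat::zeros(2, 1, CV_32F);
    kp.errorCov            = cv::Mat::eye(2, 2, CV_32F);
    kp.transitionMatrix    = (cv::Mat_<float>(2, 2) << 1, 1, 0, 1);
    kp.measurementMatrix   = (cv::Mat_<float>(1, 2) << 1, 0);
    kp.processNoiseCov     = cv::Mat::eye(2, 2, CV_32F) * 1e-4f;
    kp.measurementNoiseCov = cv::Mat::eye(1, 1, CV_32F) * 1e-1f;

    cv::GMat measurement;
    cv::GOpaque<bool> have_measurement;
    cv::GMat state = cv::gapi::KalmanFilter(measurement, have_measurement, kp);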
+@param bsParams Set of initialization parameters for Background Subtractor kernel. +*/ +GAPI_EXPORTS GMat BackgroundSubtractor(const GMat& src, const cv::gapi::video::BackgroundSubtractorParams& bsParams); + +/** @brief Standard Kalman filter algorithm . + +@note Functional textual ID is "org.opencv.video.KalmanFilter" + +@param measurement input matrix: 32-bit or 64-bit float 1-channel matrix containing measurements. +@param haveMeasurement dynamic input flag that indicates whether we get measurements +at a particular iteration . +@param control input matrix: 32-bit or 64-bit float 1-channel matrix contains control data +for changing dynamic system. +@param kfParams Set of initialization parameters for Kalman filter kernel. + +@return Output matrix is predicted or corrected state. They can be 32-bit or 64-bit float +1-channel matrix @ref CV_32FC1 or @ref CV_64FC1. + +@details If measurement matrix is given (haveMeasurements == true), corrected state will +be returned which corresponds to the pipeline +cv::KalmanFilter::predict(control) -> cv::KalmanFilter::correct(measurement). +Otherwise, predicted state will be returned which corresponds to the call of +cv::KalmanFilter::predict(control). +@sa cv::KalmanFilter +*/ +GAPI_EXPORTS GMat KalmanFilter(const GMat& measurement, const GOpaque& haveMeasurement, + const GMat& control, const cv::gapi::KalmanParams& kfParams); + +/** @overload +The case of Standard Kalman filter algorithm when there is no control in a dynamic system. +In this case the controlMatrix is empty and control vector is absent. + +@note Function textual ID is "org.opencv.video.KalmanFilterNoControl" + +@param measurement input matrix: 32-bit or 64-bit float 1-channel matrix containing measurements. +@param haveMeasurement dynamic input flag that indicates whether we get measurements +at a particular iteration. +@param kfParams Set of initialization parameters for Kalman filter kernel. + +@return Output matrix is predicted or corrected state. They can be 32-bit or 64-bit float +1-channel matrix @ref CV_32FC1 or @ref CV_64FC1. + +@sa cv::KalmanFilter + */ +GAPI_EXPORTS GMat KalmanFilter(const GMat& measurement, const GOpaque& haveMeasurement, + const cv::gapi::KalmanParams& kfParams); + //! 
@} gapi_video } //namespace gapi } //namespace cv + +namespace cv { namespace detail { +template<> struct CompileArgTag +{ + static const char* tag() + { + return "org.opencv.video.background_substractor_params"; + } +}; +} // namespace detail +} // namespace cv + #endif // OPENCV_GAPI_VIDEO_HPP diff --git a/modules/gapi/misc/python/pyopencv_gapi.hpp b/modules/gapi/misc/python/pyopencv_gapi.hpp index e25328e64f..56a7e70d88 100644 --- a/modules/gapi/misc/python/pyopencv_gapi.hpp +++ b/modules/gapi/misc/python/pyopencv_gapi.hpp @@ -3,67 +3,140 @@ #ifdef HAVE_OPENCV_GAPI +#ifdef _MSC_VER +#pragma warning(disable: 4503) // "decorated name length exceeded" + // on empty_meta(const cv::GMetaArgs&, const cv::GArgs&) +#endif + +#include +#include + // NB: Python wrapper replaces :: with _ for classes -using gapi_GKernelPackage = cv::gapi::GKernelPackage; -using gapi_GNetPackage = cv::gapi::GNetPackage; -using gapi_ie_PyParams = cv::gapi::ie::PyParams; +using gapi_GKernelPackage = cv::gapi::GKernelPackage; +using gapi_GNetPackage = cv::gapi::GNetPackage; +using gapi_ie_PyParams = cv::gapi::ie::PyParams; using gapi_wip_IStreamSource_Ptr = cv::Ptr; +using detail_ExtractArgsCallback = cv::detail::ExtractArgsCallback; +using detail_ExtractMetaCallback = cv::detail::ExtractMetaCallback; + +// NB: Python wrapper generate T_U for T +// This behavior is only observed for inputs +using GOpaque_bool = cv::GOpaque; +using GOpaque_int = cv::GOpaque; +using GOpaque_double = cv::GOpaque; +using GOpaque_float = cv::GOpaque; +using GOpaque_string = cv::GOpaque; +using GOpaque_Point2i = cv::GOpaque; +using GOpaque_Point2f = cv::GOpaque; +using GOpaque_Size = cv::GOpaque; +using GOpaque_Rect = cv::GOpaque; + +using GArray_bool = cv::GArray; +using GArray_int = cv::GArray; +using GArray_double = cv::GArray; +using GArray_float = cv::GArray; +using GArray_string = cv::GArray; +using GArray_Point2i = cv::GArray; +using GArray_Point2f = cv::GArray; +using GArray_Size = cv::GArray; +using GArray_Rect = cv::GArray; +using GArray_Scalar = cv::GArray; +using GArray_Mat = cv::GArray; +using GArray_GMat = cv::GArray; // FIXME: Python wrapper generate code without namespace std, // so it cause error: "string wasn't declared" // WA: Create using using std::string; -template<> +template <> bool pyopencv_to(PyObject* obj, std::vector& value, const ArgInfo& info) { return pyopencv_to_generic_vec(obj, value, info); } -template<> +template <> PyObject* pyopencv_from(const std::vector& value) { return pyopencv_from_generic_vec(value); } -template<> +template <> bool pyopencv_to(PyObject* obj, GRunArgs& value, const ArgInfo& info) { return pyopencv_to_generic_vec(obj, value, info); } -static PyObject* from_grunarg(const GRunArg& v) +template<> +PyObject* pyopencv_from(const cv::detail::OpaqueRef& o) +{ + switch (o.getKind()) + { + case cv::detail::OpaqueKind::CV_BOOL : return pyopencv_from(o.rref()); + case cv::detail::OpaqueKind::CV_INT : return pyopencv_from(o.rref()); + case cv::detail::OpaqueKind::CV_DOUBLE : return pyopencv_from(o.rref()); + case cv::detail::OpaqueKind::CV_FLOAT : return pyopencv_from(o.rref()); + case cv::detail::OpaqueKind::CV_STRING : return pyopencv_from(o.rref()); + case cv::detail::OpaqueKind::CV_POINT : return pyopencv_from(o.rref()); + case cv::detail::OpaqueKind::CV_POINT2F : return pyopencv_from(o.rref()); + case cv::detail::OpaqueKind::CV_SIZE : return pyopencv_from(o.rref()); + case cv::detail::OpaqueKind::CV_RECT : return pyopencv_from(o.rref()); + case cv::detail::OpaqueKind::CV_UNKNOWN : break; + 
case cv::detail::OpaqueKind::CV_UINT64 : break; + case cv::detail::OpaqueKind::CV_SCALAR : break; + case cv::detail::OpaqueKind::CV_MAT : break; + case cv::detail::OpaqueKind::CV_DRAW_PRIM : break; + } + + PyErr_SetString(PyExc_TypeError, "Unsupported GOpaque type"); + return NULL; +}; + +template <> +PyObject* pyopencv_from(const cv::detail::VectorRef& v) +{ + switch (v.getKind()) + { + case cv::detail::OpaqueKind::CV_BOOL : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_INT : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_DOUBLE : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_FLOAT : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_STRING : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_POINT : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_POINT2F : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_SIZE : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_RECT : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_SCALAR : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_MAT : return pyopencv_from_generic_vec(v.rref()); + case cv::detail::OpaqueKind::CV_UNKNOWN : break; + case cv::detail::OpaqueKind::CV_UINT64 : break; + case cv::detail::OpaqueKind::CV_DRAW_PRIM : break; + } + + PyErr_SetString(PyExc_TypeError, "Unsupported GArray type"); + return NULL; +} + +template <> +PyObject* pyopencv_from(const GRunArg& v) { switch (v.index()) { case GRunArg::index_of(): - { - const auto& m = util::get(v); - return pyopencv_from(m); - } + return pyopencv_from(util::get(v)); case GRunArg::index_of(): - { - const auto& s = util::get(v); - return pyopencv_from(s); - } + return pyopencv_from(util::get(v)); + case GRunArg::index_of(): - { - const auto& vref = util::get(v); - switch (vref.getKind()) - { - case cv::detail::OpaqueKind::CV_POINT2F: - return pyopencv_from(vref.rref()); - default: - PyErr_SetString(PyExc_TypeError, "Unsupported kind for GArray"); - return NULL; - } - } - default: - PyErr_SetString(PyExc_TypeError, "Failed to unpack GRunArgs"); - return NULL; + return pyopencv_from(util::get(v)); + + case GRunArg::index_of(): + return pyopencv_from(util::get(v)); } - GAPI_Assert(false); + + PyErr_SetString(PyExc_TypeError, "Failed to unpack GRunArgs"); + return NULL; } template<> @@ -74,7 +147,7 @@ PyObject* pyopencv_from(const GRunArgs& value) // NB: It doesn't make sense to return list with a single element if (n == 1) { - PyObject* item = from_grunarg(value[0]); + PyObject* item = pyopencv_from(value[0]); if(!item) { return NULL; @@ -85,7 +158,7 @@ PyObject* pyopencv_from(const GRunArgs& value) PyObject* list = PyList_New(n); for(i = 0; i < n; ++i) { - PyObject* item = from_grunarg(value[i]); + PyObject* item = pyopencv_from(value[i]); if(!item) { Py_DECREF(list); @@ -110,6 +183,26 @@ PyObject* pyopencv_from(const GMetaArgs& value) return pyopencv_from_generic_vec(value); } +template +void pyopencv_to_with_check(PyObject* from, T& to, const std::string& msg = "") +{ + if (!pyopencv_to(from, to, ArgInfo("", false))) + { + cv::util::throw_error(std::logic_error(msg)); + } +} + +template +void pyopencv_to_generic_vec_with_check(PyObject* from, + std::vector& to, + const std::string& msg = "") +{ + if (!pyopencv_to_generic_vec(from, to, ArgInfo("", false))) + { + 
cv::util::throw_error(std::logic_error(msg)); + } +} + template static PyObject* extract_proto_args(PyObject* py_args, PyObject* kw) { @@ -117,6 +210,7 @@ static PyObject* extract_proto_args(PyObject* py_args, PyObject* kw) GProtoArgs args; Py_ssize_t size = PyTuple_Size(py_args); + args.reserve(size); for (int i = 0; i < size; ++i) { PyObject* item = PyTuple_GetItem(py_args, i); @@ -128,9 +222,13 @@ static PyObject* extract_proto_args(PyObject* py_args, PyObject* kw) { args.emplace_back(reinterpret_cast(item)->v); } - else if (PyObject_TypeCheck(item, reinterpret_cast(pyopencv_GArrayP2f_TypePtr))) + else if (PyObject_TypeCheck(item, reinterpret_cast(pyopencv_GOpaqueT_TypePtr))) { - args.emplace_back(reinterpret_cast(item)->v.strip()); + args.emplace_back(reinterpret_cast(item)->v.strip()); + } + else if (PyObject_TypeCheck(item, reinterpret_cast(pyopencv_GArrayT_TypePtr))) + { + args.emplace_back(reinterpret_cast(item)->v.strip()); } else { @@ -152,63 +250,553 @@ static PyObject* pyopencv_cv_GOut(PyObject* , PyObject* py_args, PyObject* kw) return extract_proto_args(py_args, kw); } -static PyObject* pyopencv_cv_gin(PyObject* , PyObject* py_args, PyObject* kw) +static cv::detail::OpaqueRef extract_opaque_ref(PyObject* from, cv::detail::OpaqueKind kind) { - using namespace cv; - - GRunArgs args; - Py_ssize_t size = PyTuple_Size(py_args); - for (int i = 0; i < size; ++i) +#define HANDLE_CASE(T, O) case cv::detail::OpaqueKind::CV_##T: \ +{ \ + O obj{}; \ + pyopencv_to_with_check(from, obj, "Failed to obtain " # O); \ + return cv::detail::OpaqueRef{std::move(obj)}; \ +} +#define UNSUPPORTED(T) case cv::detail::OpaqueKind::CV_##T: break + switch (kind) { - PyObject* item = PyTuple_GetItem(py_args, i); - if (PyTuple_Check(item)) + HANDLE_CASE(BOOL, bool); + HANDLE_CASE(INT, int); + HANDLE_CASE(DOUBLE, double); + HANDLE_CASE(FLOAT, float); + HANDLE_CASE(STRING, std::string); + HANDLE_CASE(POINT, cv::Point); + HANDLE_CASE(POINT2F, cv::Point2f); + HANDLE_CASE(SIZE, cv::Size); + HANDLE_CASE(RECT, cv::Rect); + UNSUPPORTED(UNKNOWN); + UNSUPPORTED(UINT64); + UNSUPPORTED(SCALAR); + UNSUPPORTED(MAT); + UNSUPPORTED(DRAW_PRIM); +#undef HANDLE_CASE +#undef UNSUPPORTED + } + util::throw_error(std::logic_error("Unsupported type for GOpaqueT")); +} + +static cv::detail::VectorRef extract_vector_ref(PyObject* from, cv::detail::OpaqueKind kind) +{ +#define HANDLE_CASE(T, O) case cv::detail::OpaqueKind::CV_##T: \ +{ \ + std::vector obj; \ + pyopencv_to_generic_vec_with_check(from, obj, "Failed to obtain vector of " # O); \ + return cv::detail::VectorRef{std::move(obj)}; \ +} +#define UNSUPPORTED(T) case cv::detail::OpaqueKind::CV_##T: break + switch (kind) + { + HANDLE_CASE(BOOL, bool); + HANDLE_CASE(INT, int); + HANDLE_CASE(DOUBLE, double); + HANDLE_CASE(FLOAT, float); + HANDLE_CASE(STRING, std::string); + HANDLE_CASE(POINT, cv::Point); + HANDLE_CASE(POINT2F, cv::Point2f); + HANDLE_CASE(SIZE, cv::Size); + HANDLE_CASE(RECT, cv::Rect); + HANDLE_CASE(SCALAR, cv::Scalar); + HANDLE_CASE(MAT, cv::Mat); + UNSUPPORTED(UNKNOWN); + UNSUPPORTED(UINT64); + UNSUPPORTED(DRAW_PRIM); +#undef HANDLE_CASE +#undef UNSUPPORTED + } + util::throw_error(std::logic_error("Unsupported type for GArrayT")); +} + +static cv::GRunArg extract_run_arg(const cv::GTypeInfo& info, PyObject* item) +{ + switch (info.shape) + { + case cv::GShape::GMAT: { - cv::Scalar s; - if (pyopencv_to(item, s, ArgInfo("scalar", false))) + // NB: In case streaming it can be IStreamSource or cv::Mat + if (PyObject_TypeCheck(item, + 
reinterpret_cast(pyopencv_gapi_wip_IStreamSource_TypePtr))) { - args.emplace_back(s); - } - else - { - PyErr_SetString(PyExc_TypeError, "Failed convert tuple to cv::Scalar"); - return NULL; + cv::gapi::wip::IStreamSource::Ptr source = + reinterpret_cast(item)->v; + return source; } + cv::Mat obj; + pyopencv_to_with_check(item, obj, "Failed to obtain cv::Mat"); + return obj; } - else if (PyArray_Check(item)) + case cv::GShape::GSCALAR: { - cv::Mat m; - if (pyopencv_to(item, m, ArgInfo("mat", false))) - { - args.emplace_back(m); - } - else - { - PyErr_SetString(PyExc_TypeError, "Failed convert array to cv::Mat"); - return NULL; - } + cv::Scalar obj; + pyopencv_to_with_check(item, obj, "Failed to obtain cv::Scalar"); + return obj; } - else if (PyObject_TypeCheck(item, - reinterpret_cast(pyopencv_gapi_wip_IStreamSource_TypePtr))) + case cv::GShape::GOPAQUE: { - cv::gapi::wip::IStreamSource::Ptr source = - reinterpret_cast(item)->v; - args.emplace_back(source); + return extract_opaque_ref(item, info.kind); } - else + case cv::GShape::GARRAY: { - PyErr_SetString(PyExc_TypeError, "cv.gin can works only with cv::Mat," - "cv::Scalar, cv::gapi::wip::IStreamSource::Ptr"); - return NULL; + return extract_vector_ref(item, info.kind); + } + case cv::GShape::GFRAME: + { + // NB: Isn't supported yet. + break; } } - return pyopencv_from_generic_vec(args); + util::throw_error(std::logic_error("Unsupported output shape")); } -static PyObject* pyopencv_cv_gout(PyObject* o, PyObject* py_args, PyObject* kw) +static cv::GRunArgs extract_run_args(const cv::GTypesInfo& info, PyObject* py_args) { - return pyopencv_cv_gin(o, py_args, kw); + cv::GRunArgs args; + Py_ssize_t tuple_size = PyTuple_Size(py_args); + args.reserve(tuple_size); + + for (int i = 0; i < tuple_size; ++i) + { + args.push_back(extract_run_arg(info[i], PyTuple_GetItem(py_args, i))); + } + + return args; } +static cv::GMetaArg extract_meta_arg(const cv::GTypeInfo& info, PyObject* item) +{ + switch (info.shape) + { + case cv::GShape::GMAT: + { + cv::Mat obj; + pyopencv_to_with_check(item, obj, "Failed to obtain cv::Mat"); + return cv::GMetaArg{cv::descr_of(obj)}; + } + case cv::GShape::GSCALAR: + { + cv::Scalar obj; + pyopencv_to_with_check(item, obj, "Failed to obtain cv::Scalar"); + return cv::GMetaArg{cv::descr_of(obj)}; + } + case cv::GShape::GARRAY: + { + return cv::GMetaArg{cv::empty_array_desc()}; + } + case cv::GShape::GOPAQUE: + { + return cv::GMetaArg{cv::empty_gopaque_desc()}; + } + case cv::GShape::GFRAME: + { + // NB: Isn't supported yet. 
+ break; + } + } + util::throw_error(std::logic_error("Unsupported output shape")); +} + +static cv::GMetaArgs extract_meta_args(const cv::GTypesInfo& info, PyObject* py_args) +{ + cv::GMetaArgs metas; + Py_ssize_t tuple_size = PyTuple_Size(py_args); + metas.reserve(tuple_size); + + for (int i = 0; i < tuple_size; ++i) + { + metas.push_back(extract_meta_arg(info[i], PyTuple_GetItem(py_args, i))); + } + + return metas; +} + +inline PyObject* extract_opaque_value(const cv::GArg& value) +{ + GAPI_Assert(value.kind != cv::detail::ArgKind::GOBJREF); +#define HANDLE_CASE(T, O) case cv::detail::OpaqueKind::CV_##T: \ + { \ + return pyopencv_from(value.get()); \ + } + +#define UNSUPPORTED(T) case cv::detail::OpaqueKind::CV_##T: break + switch (value.opaque_kind) + { + HANDLE_CASE(BOOL, bool); + HANDLE_CASE(INT, int); + HANDLE_CASE(DOUBLE, double); + HANDLE_CASE(FLOAT, float); + HANDLE_CASE(STRING, std::string); + HANDLE_CASE(POINT, cv::Point); + HANDLE_CASE(POINT2F, cv::Point2f); + HANDLE_CASE(SIZE, cv::Size); + HANDLE_CASE(RECT, cv::Rect); + HANDLE_CASE(SCALAR, cv::Scalar); + HANDLE_CASE(MAT, cv::Mat); + UNSUPPORTED(UNKNOWN); + UNSUPPORTED(UINT64); + UNSUPPORTED(DRAW_PRIM); +#undef HANDLE_CASE +#undef UNSUPPORTED + } + util::throw_error(std::logic_error("Unsupported kernel input type")); +} + +static cv::GRunArgs run_py_kernel(PyObject* kernel, + const cv::gapi::python::GPythonContext &ctx) +{ + const auto& ins = ctx.ins; + const auto& in_metas = ctx.in_metas; + const auto& out_info = ctx.out_info; + + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + + cv::GRunArgs outs; + try + { + int in_idx = 0; + PyObject* args = PyTuple_New(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) + { + // NB: If meta is monostate then object isn't associated with G-TYPE, so in case it + // kind matches with supported types do conversion from c++ to python, if not (CV_UNKNOWN) + // obtain PyObject* and pass as-is. + if (cv::util::holds_alternative(in_metas[i])) + { + PyTuple_SetItem(args, i, + ins[i].opaque_kind != cv::detail::OpaqueKind::CV_UNKNOWN ? extract_opaque_value(ins[i]) + : ins[i].get()); + continue; + } + + switch (in_metas[i].index()) + { + case cv::GMetaArg::index_of(): + PyTuple_SetItem(args, i, pyopencv_from(ins[i].get())); + break; + case cv::GMetaArg::index_of(): + PyTuple_SetItem(args, i, pyopencv_from(ins[i].get())); + break; + case cv::GMetaArg::index_of(): + PyTuple_SetItem(args, i, pyopencv_from(ins[i].get())); + break; + case cv::GMetaArg::index_of(): + PyTuple_SetItem(args, i, pyopencv_from(ins[i].get())); + break; + case cv::GMetaArg::index_of(): + util::throw_error(std::logic_error("GFrame isn't supported for custom operation")); + break; + } + ++in_idx; + } + + PyObject* result = PyObject_CallObject(kernel, args); + + outs = out_info.size() == 1 ? cv::GRunArgs{extract_run_arg(out_info[0], result)} + : extract_run_args(out_info, result); + } + catch (...) + { + PyGILState_Release(gstate); + throw; + } + PyGILState_Release(gstate); + + return outs; +} + +// FIXME: Now it's impossible to obtain meta function from operation, +// because kernel connects to operation only by id (string). 
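For reference, this is how the pieces fit together on the C++ side: a Python callable ends up wrapped into a GPythonFunctor and included into a kernel package, as done by the kernels() binding below. The operation id "custom.op" and the trivial meta/run lambdas here are illustrative stand-ins:

    auto meta_fn = [](const cv::GMetaArgs&, const cv::GArgs&) {
        return cv::GMetaArgs{};    // out-meta is resolved elsewhere
    };
    auto run_fn = [](const cv::gapi::python::GPythonContext&) {
        return cv::GRunArgs{};     // would call back into Python here
    };
    cv::gapi::python::GPythonFunctor functor("custom.op", meta_fn, run_fn);
    cv::gapi::GKernelPackage pkg;
    pkg.include(functor);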
+static cv::GMetaArgs empty_meta(const cv::GMetaArgs &, const cv::GArgs &) { + return {}; +} + +static GMetaArg get_meta_arg(PyObject* obj) +{ + if (PyObject_TypeCheck(obj, + reinterpret_cast(pyopencv_GMatDesc_TypePtr))) + { + return cv::GMetaArg{reinterpret_cast(obj)->v}; + } + else if (PyObject_TypeCheck(obj, + reinterpret_cast(pyopencv_GScalarDesc_TypePtr))) + { + return cv::GMetaArg{reinterpret_cast(obj)->v}; + } + else if (PyObject_TypeCheck(obj, + reinterpret_cast(pyopencv_GArrayDesc_TypePtr))) + { + return cv::GMetaArg{reinterpret_cast(obj)->v}; + } + else if (PyObject_TypeCheck(obj, + reinterpret_cast(pyopencv_GOpaqueDesc_TypePtr))) + { + return cv::GMetaArg{reinterpret_cast(obj)->v}; + } + else + { + util::throw_error(std::logic_error("Unsupported output meta type")); + } +} + +static cv::GMetaArgs get_meta_args(PyObject* tuple) +{ + size_t size = PyTuple_Size(tuple); + + cv::GMetaArgs metas; + metas.reserve(size); + for (size_t i = 0; i < size; ++i) + { + metas.push_back(get_meta_arg(PyTuple_GetItem(tuple, i))); + } + + return metas; +} + +static GMetaArgs python_meta(PyObject* outMeta, const cv::GMetaArgs &meta, const cv::GArgs &gargs) { + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + + cv::GMetaArgs out_metas; + try + { + PyObject* args = PyTuple_New(meta.size()); + size_t idx = 0; + for (auto&& m : meta) + { + switch (m.index()) + { + case cv::GMetaArg::index_of(): + PyTuple_SetItem(args, idx, pyopencv_from(cv::util::get(m))); + break; + case cv::GMetaArg::index_of(): + PyTuple_SetItem(args, idx, pyopencv_from(cv::util::get(m))); + break; + case cv::GMetaArg::index_of(): + PyTuple_SetItem(args, idx, pyopencv_from(cv::util::get(m))); + break; + case cv::GMetaArg::index_of(): + PyTuple_SetItem(args, idx, pyopencv_from(cv::util::get(m))); + break; + case cv::GMetaArg::index_of(): + PyTuple_SetItem(args, idx, gargs[idx].get()); + break; + case cv::GMetaArg::index_of(): + util::throw_error(std::logic_error("GFrame isn't supported for custom operation")); + break; + } + ++idx; + } + PyObject* result = PyObject_CallObject(outMeta, args); + out_metas = PyTuple_Check(result) ? get_meta_args(result) + : cv::GMetaArgs{get_meta_arg(result)}; + } + catch (...) 
+ { + PyGILState_Release(gstate); + throw; + } + PyGILState_Release(gstate); + + return out_metas; +} + +static PyObject* pyopencv_cv_gapi_kernels(PyObject* , PyObject* py_args, PyObject*) +{ + using namespace cv; + gapi::GKernelPackage pkg; + Py_ssize_t size = PyTuple_Size(py_args); + for (int i = 0; i < size; ++i) + { + PyObject* pair = PyTuple_GetItem(py_args, i); + PyObject* kernel = PyTuple_GetItem(pair, 0); + + std::string id; + if (!pyopencv_to(PyTuple_GetItem(pair, 1), id, ArgInfo("id", false))) + { + PyErr_SetString(PyExc_TypeError, "Failed to obtain: kernel id must be a string"); + return NULL; + } + Py_INCREF(kernel); + gapi::python::GPythonFunctor f(id.c_str(), + empty_meta, + std::bind(run_py_kernel, + kernel, + std::placeholders::_1)); + pkg.include(f); + } + return pyopencv_from(pkg); +} + +static PyObject* pyopencv_cv_gapi_op(PyObject* , PyObject* py_args, PyObject*) +{ + using namespace cv; + Py_ssize_t size = PyTuple_Size(py_args); + std::string id; + if (!pyopencv_to(PyTuple_GetItem(py_args, 0), id, ArgInfo("id", false))) + { + PyErr_SetString(PyExc_TypeError, "Failed to obtain: operation id must be a string"); + return NULL; + } + PyObject* outMeta = PyTuple_GetItem(py_args, 1); + Py_INCREF(outMeta); + + cv::GArgs args; + for (int i = 2; i < size; i++) + { + PyObject* item = PyTuple_GetItem(py_args, i); + if (PyObject_TypeCheck(item, + reinterpret_cast(pyopencv_GMat_TypePtr))) + { + args.emplace_back(reinterpret_cast(item)->v); + } + else if (PyObject_TypeCheck(item, + reinterpret_cast(pyopencv_GScalar_TypePtr))) + { + args.emplace_back(reinterpret_cast(item)->v); + } + else if (PyObject_TypeCheck(item, + reinterpret_cast(pyopencv_GOpaqueT_TypePtr))) + { + auto&& arg = reinterpret_cast(item)->v.arg(); +#define HC(T, K) case cv::GOpaqueT::Storage:: index_of>(): \ + args.emplace_back(cv::util::get>(arg)); \ + break; \ + + SWITCH(arg.index(), GOPAQUE_TYPE_LIST_G, HC) +#undef HC + } + else if (PyObject_TypeCheck(item, + reinterpret_cast(pyopencv_GArrayT_TypePtr))) + { + auto&& arg = reinterpret_cast(item)->v.arg(); +#define HC(T, K) case cv::GArrayT::Storage:: index_of>(): \ + args.emplace_back(cv::util::get>(arg)); \ + break; \ + + SWITCH(arg.index(), GARRAY_TYPE_LIST_G, HC) +#undef HC + } + else + { + Py_INCREF(item); + args.emplace_back(cv::GArg(item)); + } + } + + cv::GKernel::M outMetaWrapper = std::bind(python_meta, + outMeta, + std::placeholders::_1, + std::placeholders::_2); + return pyopencv_from(cv::gapi::wip::op(id, outMetaWrapper, std::move(args))); +} + +static PyObject* pyopencv_cv_gin(PyObject*, PyObject* py_args, PyObject*) +{ + Py_INCREF(py_args); + auto callback = cv::detail::ExtractArgsCallback{[=](const cv::GTypesInfo& info) + { + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + + cv::GRunArgs args; + try + { + args = extract_run_args(info, py_args); + } + catch (...) + { + PyGILState_Release(gstate); + throw; + } + PyGILState_Release(gstate); + return args; + }}; + + return pyopencv_from(callback); +} + +static PyObject* pyopencv_cv_descr_of(PyObject*, PyObject* py_args, PyObject*) +{ + Py_INCREF(py_args); + auto callback = cv::detail::ExtractMetaCallback{[=](const cv::GTypesInfo& info) + { + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + + cv::GMetaArgs args; + try + { + args = extract_meta_args(info, py_args); + } + catch (...) 
+ { + PyGILState_Release(gstate); + throw; + } + PyGILState_Release(gstate); + return args; + }}; + return pyopencv_from(callback); +} + +template +struct PyOpenCV_Converter> +{ + static PyObject* from(const cv::GArray& p) + { + return pyopencv_from(cv::GArrayT(p)); + } + static bool to(PyObject *obj, cv::GArray& value, const ArgInfo& info) + { + if (PyObject_TypeCheck(obj, reinterpret_cast(pyopencv_GArrayT_TypePtr))) + { + auto& array = reinterpret_cast(obj)->v; + try { + value = cv::util::get>(array.arg()); + } catch (...) { + return false; + } + return true; + } + return false; + } +}; + +template +struct PyOpenCV_Converter> +{ + static PyObject* from(const cv::GOpaque& p) + { + return pyopencv_from(cv::GOpaqueT(p)); + } + static bool to(PyObject *obj, cv::GOpaque& value, const ArgInfo& info) + { + if (PyObject_TypeCheck(obj, reinterpret_cast(pyopencv_GOpaqueT_TypePtr))) + { + auto& opaque = reinterpret_cast(obj)->v; + try { + value = cv::util::get>(opaque.arg()); + } catch (...) { + return false; + } + return true; + } + return false; + } +}; + + +// extend cv.gapi.wip. methods +#define PYOPENCV_EXTRA_METHODS_GAPI_WIP \ + {"kernels", CV_PY_FN_WITH_KW(pyopencv_cv_gapi_kernels), "kernels(...) -> GKernelPackage"}, \ + {"op", CV_PY_FN_WITH_KW_(pyopencv_cv_gapi_op, 0), "kernels(...) -> retval\n"}, \ + + #endif // HAVE_OPENCV_GAPI #endif // OPENCV_GAPI_PYOPENCV_GAPI_HPP diff --git a/modules/gapi/misc/python/python_bridge.hpp b/modules/gapi/misc/python/python_bridge.hpp new file mode 100644 index 0000000000..51f0ca8ab0 --- /dev/null +++ b/modules/gapi/misc/python/python_bridge.hpp @@ -0,0 +1,327 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+// +// Copyright (C) 2021 Intel Corporation + +#ifndef OPENCV_GAPI_PYTHON_BRIDGE_HPP +#define OPENCV_GAPI_PYTHON_BRIDGE_HPP + +#include +#include +#include + +#define ID(T, E) T +#define ID_(T, E) ID(T, E), + +#define WRAP_ARGS(T, E, G) \ + G(T, E) + +#define SWITCH(type, LIST_G, HC) \ + switch(type) { \ + LIST_G(HC, HC) \ + default: \ + GAPI_Assert(false && "Unsupported type"); \ + } + +#define GARRAY_TYPE_LIST_G(G, G2) \ +WRAP_ARGS(bool , cv::gapi::ArgType::CV_BOOL, G) \ +WRAP_ARGS(int , cv::gapi::ArgType::CV_INT, G) \ +WRAP_ARGS(double , cv::gapi::ArgType::CV_DOUBLE, G) \ +WRAP_ARGS(float , cv::gapi::ArgType::CV_FLOAT, G) \ +WRAP_ARGS(std::string , cv::gapi::ArgType::CV_STRING, G) \ +WRAP_ARGS(cv::Point , cv::gapi::ArgType::CV_POINT, G) \ +WRAP_ARGS(cv::Point2f , cv::gapi::ArgType::CV_POINT2F, G) \ +WRAP_ARGS(cv::Size , cv::gapi::ArgType::CV_SIZE, G) \ +WRAP_ARGS(cv::Rect , cv::gapi::ArgType::CV_RECT, G) \ +WRAP_ARGS(cv::Scalar , cv::gapi::ArgType::CV_SCALAR, G) \ +WRAP_ARGS(cv::Mat , cv::gapi::ArgType::CV_MAT, G) \ +WRAP_ARGS(cv::GMat , cv::gapi::ArgType::CV_GMAT, G2) + +#define GOPAQUE_TYPE_LIST_G(G, G2) \ +WRAP_ARGS(bool , cv::gapi::ArgType::CV_BOOL, G) \ +WRAP_ARGS(int , cv::gapi::ArgType::CV_INT, G) \ +WRAP_ARGS(double , cv::gapi::ArgType::CV_DOUBLE, G) \ +WRAP_ARGS(float , cv::gapi::ArgType::CV_FLOAT, G) \ +WRAP_ARGS(std::string , cv::gapi::ArgType::CV_STRING, G) \ +WRAP_ARGS(cv::Point , cv::gapi::ArgType::CV_POINT, G) \ +WRAP_ARGS(cv::Point2f , cv::gapi::ArgType::CV_POINT2F, G) \ +WRAP_ARGS(cv::Size , cv::gapi::ArgType::CV_SIZE, G) \ +WRAP_ARGS(cv::Rect , cv::gapi::ArgType::CV_RECT, G2) \ + +namespace cv { +namespace gapi { + +// NB: cv.gapi.CV_BOOL in python +enum ArgType { + CV_BOOL, + CV_INT, + CV_DOUBLE, + CV_FLOAT, + CV_STRING, + CV_POINT, + CV_POINT2F, + CV_SIZE, + CV_RECT, + CV_SCALAR, + CV_MAT, + CV_GMAT, +}; + +GAPI_EXPORTS_W inline cv::GInferOutputs infer(const String& name, const cv::GInferInputs& inputs) +{ + return infer(name, inputs); +} + +GAPI_EXPORTS_W inline GInferOutputs infer(const std::string& name, + const cv::GOpaque& roi, + const GInferInputs& inputs) +{ + return infer(name, roi, inputs); +} + +GAPI_EXPORTS_W inline GInferListOutputs infer(const std::string& name, + const cv::GArray& rois, + const GInferInputs& inputs) +{ + return infer(name, rois, inputs); +} + +GAPI_EXPORTS_W inline GInferListOutputs infer2(const std::string& name, + const cv::GMat in, + const GInferListInputs& inputs) +{ + return infer2(name, in, inputs); +} + +} // namespace gapi + +namespace detail { + +template