diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake
index 1c30cccf4b..7f229cde96 100644
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@@ -171,6 +171,8 @@ elseif(MSVC)
     set(OpenCV_RUNTIME vc15)
   elseif(MSVC_VERSION MATCHES "^192[0-9]$")
     set(OpenCV_RUNTIME vc16)
+  elseif(MSVC_VERSION MATCHES "^19[34][0-9]$")  # VS 2022 (v143 toolset) reports 1930..1949
+    set(OpenCV_RUNTIME vc17)
   else()
     message(WARNING "OpenCV does not recognize MSVC_VERSION \"${MSVC_VERSION}\". Cannot set OpenCV_RUNTIME")
   endif()
diff --git a/cmake/templates/OpenCVConfig.root-WIN32.cmake.in b/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
index 5da438a9ee..b0f254ebe8 100644
--- a/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
+++ b/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
@@ -137,6 +137,20 @@ elseif(MSVC)
         set(OpenCV_RUNTIME vc14) # selecting previous compatible runtime version
       endif()
     endif()
+  elseif(MSVC_VERSION MATCHES "^19[34][0-9]$")  # VS 2022 (v143 toolset) reports 1930..1949
+    set(OpenCV_RUNTIME vc17)
+    check_one_config(has_VS2022)  # NOTE(review): presumably tells whether a config exists for the OpenCV_RUNTIME set above - confirm
+    if(NOT has_VS2022)
+     set(OpenCV_RUNTIME vc16) # selecting previous compatible runtime version
+     check_one_config(has_VS2019)
+     if(NOT has_VS2019)
+       set(OpenCV_RUNTIME vc15) # selecting previous compatible runtime version
+       check_one_config(has_VS2017)
+       if(NOT has_VS2017)
+         set(OpenCV_RUNTIME vc14) # selecting previous compatible runtime version
+       endif()
+     endif()
+    endif()
   endif()
 elseif(MINGW)
   set(OpenCV_RUNTIME mingw)
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 8fccd19798..fe64d5d7c9 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -57,8 +57,8 @@
 #endif
 
 #if defined __unix__ || defined __APPLE__ || defined __GLIBC__ \
-    || defined __HAIKU__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ \
-    || defined __OpenBSD__
+    || defined __HAIKU__ || defined __EMSCRIPTEN__ \
+    || defined __FreeBSD__ || defined __NetBSD__ || defined __OpenBSD__
     #include <unistd.h>
     #include <stdio.h>
     #include <sys/types.h>
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 9d35939964..f3f47e41de 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -55,6 +55,18 @@
 
 #include <opencv2/core/utils/filesystem.private.hpp>
 
+// Auto-detect ThreadSanitizer (clang only) unless the build already defines OPENCV_WITH_THREAD_SANITIZER
+#if !defined(OPENCV_WITH_THREAD_SANITIZER) && defined(__clang__) && defined(__has_feature)
+  #if __has_feature(thread_sanitizer)
+      #define OPENCV_WITH_THREAD_SANITIZER 1
+  #endif
+#endif
+#ifndef OPENCV_WITH_THREAD_SANITIZER
+    #define OPENCV_WITH_THREAD_SANITIZER 0
+#elif OPENCV_WITH_THREAD_SANITIZER
+    #include <atomic>  // needed whenever the flag is on, also when it was set externally; assume C++11
+#endif
+
 namespace cv {
 
 static void _initSystem()
@@ -1383,64 +1395,62 @@ namespace details {
 #endif
 #endif
 
-template <class T>
-class DisposedSingletonMark
-{
-private:
-    static bool mark;
-protected:
-    DisposedSingletonMark() {}
-    ~DisposedSingletonMark()
-    {
-        mark = true;
-    }
-public:
-    static bool isDisposed() { return mark; }
-};
-
 // TLS platform abstraction layer
-class TlsAbstraction : public DisposedSingletonMark<TlsAbstraction>
+class TlsAbstraction  // the single instance is heap-allocated by getTlsAbstraction() and intentionally never deleted
 {
 public:
     TlsAbstraction();
-    ~TlsAbstraction();
-    void* getData() const
+    ~TlsAbstraction()
     {
-        if (isDisposed())  // guard: static initialization order fiasco
-            return NULL;
-        return getData_();
-    }
-    void setData(void *pData)
-    {
-        if (isDisposed())  // guard: static initialization order fiasco
-            return;
-        return setData_(pData);
+        // TlsAbstraction singleton should not be released
+        // There is no reliable way to avoid problems caused by static initialization order fiasco
+        // NB: Do NOT use logging here
+        fprintf(stderr, "OpenCV FATAL: TlsAbstraction::~TlsAbstraction() call is not expected\n");
+        fflush(stderr);
     }
 
+    void* getData() const;  // non-WINRT builds return NULL once 'disposed' is set
+    void setData(void *pData);  // non-WINRT builds silently ignore the call once 'disposed' is set
+
+    void releaseSystemResources();  // sets cv::__termination; non-WINRT builds also set 'disposed' and release the TLS key
+
 private:
 
 #ifdef _WIN32
 #ifndef WINRT
     DWORD tlsKey;
+    bool disposed;  // set in releaseSystemResources(), tested in getData()/setData()
 #endif
 #else // _WIN32
     pthread_key_t  tlsKey;
+#if OPENCV_WITH_THREAD_SANITIZER
+    std::atomic<bool> disposed;  // atomic flavour, selected only when OPENCV_WITH_THREAD_SANITIZER is non-zero
+#else
+    bool disposed;  // set in releaseSystemResources(), tested in getData()/setData()
+#endif
 #endif
 };
 
-template<> bool DisposedSingletonMark<TlsAbstraction>::mark = false;
-
-static TlsAbstraction& getTlsAbstraction_()
+class TlsAbstractionReleaseGuard  // calls TlsAbstraction::releaseSystemResources() from its destructor
 {
-    static TlsAbstraction g_tls;  // disposed in atexit() handlers (required for unregistering our callbacks)
-    return g_tls;
-}
+    TlsAbstraction& tls_;  // not owned: the guard never deletes the referenced object
+public:
+    TlsAbstractionReleaseGuard(TlsAbstraction& tls) : tls_(tls)
+    {
+        /* nothing: only the reference is stored */
+    }
+    ~TlsAbstractionReleaseGuard()
+    {
+        tls_.releaseSystemResources();  // the TlsAbstraction object itself stays allocated
+    }
+};
+
+// Returns the lazily created TlsAbstraction instance (no NULL-on-disposal result anymore). TODO use reference
 static TlsAbstraction* getTlsAbstraction()
 {
-    static TlsAbstraction* instance = &getTlsAbstraction_();
-    return DisposedSingletonMark<TlsAbstraction>::isDisposed() ? NULL : instance;
+    static TlsAbstraction *g_tls = new TlsAbstraction();  // memory leak is intended here to avoid disposing of TLS container
+    static TlsAbstractionReleaseGuard g_tlsReleaseGuard(*g_tls);  // its destructor calls g_tls->releaseSystemResources(); g_tls is not deleted
+    return g_tls;
 }
 
 
@@ -1448,15 +1458,15 @@ static TlsAbstraction* getTlsAbstraction()
 #ifdef WINRT
 static __declspec( thread ) void* tlsData = NULL; // using C++11 thread attribute for local thread data
 TlsAbstraction::TlsAbstraction() {}
-TlsAbstraction::~TlsAbstraction()
+void TlsAbstraction::releaseSystemResources()  // WINRT variant: only raises the termination flag
 {
     cv::__termination = true;  // DllMain is missing in static builds
 }
-void* TlsAbstraction::getData_() const
+void* TlsAbstraction::getData() const  // WINRT variant: reads the thread-local 'tlsData' variable, no 'disposed' check
 {
     return tlsData;
 }
-void TlsAbstraction::setData_(void *pData)
+void TlsAbstraction::setData(void *pData)  // WINRT variant: writes the thread-local 'tlsData' variable, no 'disposed' check
 {
     tlsData = pData;
 }
@@ -1465,6 +1475,7 @@ void TlsAbstraction::setData_(void *pData)
 static void NTAPI opencv_fls_destructor(void* pData);
 #endif // CV_USE_FLS
 TlsAbstraction::TlsAbstraction()
+    : disposed(false)
 {
 #ifndef CV_USE_FLS
     tlsKey = TlsAlloc();
@@ -1473,9 +1484,10 @@ TlsAbstraction::TlsAbstraction()
 #endif // CV_USE_FLS
     CV_Assert(tlsKey != TLS_OUT_OF_INDEXES);
 }
-TlsAbstraction::~TlsAbstraction()
+void TlsAbstraction::releaseSystemResources()
 {
     cv::__termination = true;  // DllMain is missing in static builds
+    disposed = true;
 #ifndef CV_USE_FLS
     TlsFree(tlsKey);
 #else // CV_USE_FLS
@@ -1483,16 +1495,20 @@ TlsAbstraction::~TlsAbstraction()
 #endif // CV_USE_FLS
     tlsKey = TLS_OUT_OF_INDEXES;
 }
-void* TlsAbstraction::getData_() const
+void* TlsAbstraction::getData() const  // Win32 variant (TLS or FLS slot, depending on CV_USE_FLS)
 {
+    if (disposed)  // releaseSystemResources() already reset tlsKey to TLS_OUT_OF_INDEXES
+        return NULL;
 #ifndef CV_USE_FLS
     return TlsGetValue(tlsKey);
 #else // CV_USE_FLS
     return FlsGetValue(tlsKey);
 #endif // CV_USE_FLS
 }
-void TlsAbstraction::setData_(void *pData)
+void TlsAbstraction::setData(void *pData)
 {
+    if (disposed)
+        return;  // no-op
 #ifndef CV_USE_FLS
     CV_Assert(TlsSetValue(tlsKey, pData) == TRUE);
 #else // CV_USE_FLS
@@ -1503,12 +1519,14 @@ void TlsAbstraction::setData_(void *pData)
 #else // _WIN32
 static void opencv_tls_destructor(void* pData);
 TlsAbstraction::TlsAbstraction()
+    : disposed(false)  // flipped to true only by releaseSystemResources()
 {
     CV_Assert(pthread_key_create(&tlsKey, opencv_tls_destructor) == 0);
 }
-TlsAbstraction::~TlsAbstraction()
+void TlsAbstraction::releaseSystemResources()
 {
     cv::__termination = true;  // DllMain is missing in static builds
+    disposed = true;
     if (pthread_key_delete(tlsKey) != 0)
     {
         // Don't use logging here
@@ -1516,12 +1534,16 @@ TlsAbstraction::~TlsAbstraction()
         fflush(stderr);
     }
 }
-void* TlsAbstraction::getData_() const
+void* TlsAbstraction::getData() const  // pthread variant
 {
+    if (disposed)  // set by releaseSystemResources(), which also calls pthread_key_delete(tlsKey)
+        return NULL;
     return pthread_getspecific(tlsKey);
 }
-void TlsAbstraction::setData_(void *pData)
+void TlsAbstraction::setData(void *pData)  // pthread variant
 {
+    if (disposed)  // set by releaseSystemResources(), which also calls pthread_key_delete(tlsKey)
+        return;  // no-op: pData is not stored
     CV_Assert(pthread_setspecific(tlsKey, pData) == 0);
 }
 #endif
@@ -1549,6 +1571,7 @@ public:
     TlsStorage() :
         tlsSlotsSize(0)
     {
+        (void)getTlsAbstraction();  // ensure singleton initialization (for correct order of atexit calls)
         tlsSlots.reserve(32);
         threads.reserve(32);
         g_isTlsStorageInitialized = true;
@@ -1786,11 +1809,11 @@ static void WINAPI opencv_fls_destructor(void* pData)
 #endif // CV_USE_FLS
 #endif // _WIN32
 
-static TlsAbstraction* const g_force_initialization_of_TlsAbstraction
+static TlsStorage* const g_force_initialization_of_TlsStorage
 #if defined __GNUC__
     __attribute__((unused))
 #endif
-    = getTlsAbstraction();
+    = &getTlsStorage();
 
 
 #else  // OPENCV_DISABLE_THREAD_SUPPORT
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index ec2904ee69..bb6fd5f5cc 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -48,6 +48,7 @@
 #include "../ie_ngraph.hpp"
 #include "../op_vkcom.hpp"
 
+#include <opencv2/core/utils/configuration.private.hpp>
 #include <opencv2/core/utils/logger.hpp>
 
 #include "opencv2/core/hal/hal.hpp"
@@ -1736,7 +1737,26 @@ public:
             config.pad = pad;
             config.stride = stride;
             config.dilation = dilation;
+            if (inputs[0].dims != 4 && inputs[0].dims != umat_blobs[0].dims)
+            {
+                static bool bypassCheck = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_CONVOLUTION_IGNORE_INPUT_DIMS_4_CHECK", false);
+                if (!bypassCheck)
+                {
+                    CV_LOG_ERROR(NULL, "DNN/OpenCL: Unsupported configuration: inputs[0].dims=" << inputs[0].dims << "  umat_blobs[0].dims=" << umat_blobs[0].dims
+                        << ". Consider reporting complete reproducer to https://github.com/opencv/opencv/issues/20833."
+                        << " You can skip this check temporary through OPENCV_OCL4DNN_CONVOLUTION_IGNORE_INPUT_DIMS_4_CHECK=1"
+                    );
+                    return false;
+                }
+            }
             config.group = inputs[0].size[1] / umat_blobs[0].size[1];
+            if (config.group < 1)  // config.group == 0 causes div by zero in ocl4dnn code
+            {
+                CV_LOG_WARNING(NULL, "DNN/OpenCL: Unsupported config.group=" << config.group
+                    << ". Consider reporting complete reproducer to https://github.com/opencv/opencv/issues/20833"
+                );
+                return false;
+            }
             config.bias_term = umat_blobs.size() == 2;
             config.use_half = use_half;
 
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
index d6fb83becb..bf5fba71a1 100644
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -222,8 +222,6 @@ class OCL4DNNConvSpatial
         bool createDWConvKernel(int32_t blockWidth,
                                 int32_t blockHeight,
                                 int32_t blockDepth);
-        void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
-                             int32_t offset, int32_t size, bool write_only);
         bool convolve(const UMat &bottom, UMat &top,
                       const UMat &weight, const UMat &bias,
                       int32_t numImages,
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index caf1c97a48..beb5fcd96d 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -167,6 +167,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
     channels_   = config.in_shape[dims - spatial_dims - 1];
     num_output_ = config.out_shape[dims - spatial_dims - 1];
     group_ = config.group;
+    CV_CheckGT(group_, 0, "");  // avoid div by zero below
 
     fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
     fused_eltwise_ = false;
@@ -218,14 +219,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
 #endif
         if (!use_cache_path_)
         {
-            static int warn_ = 0;
-            if (!warn_)
-            {
-                std::cerr
-                    << "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl
-                    << std::endl;
-                warn_ = true;
-            }
+            CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_);
         }
     }
 
@@ -418,7 +412,6 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         addDef("CHANNELS", channels_ / group_);
         addDef("APPLY_BIAS", bias_term_);
         addDef("OUTPUT_Z", M_);
-        addDef("ZPAR", 1);
         setFusionDefine(fused_activ_, fused_eltwise_);
 
         src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
@@ -672,8 +665,7 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
                       int r, int c, int interleavedRows, int nonInterleavedRows,
                       int blockWidth, int rowAlignment )
 {
-    CHECK_EQ(interleavedRows % 2, 0) <<
-             "interleaveMatrix only supports even values for interleavedRows.";
+    CV_Check(interleavedRows, interleavedRows % 2 == 0, "interleaveMatrix only supports even values for interleavedRows.");
 
     size_t memSize = r * c * sizeof(float);
     size_t dstSize = memSize *
@@ -685,9 +677,12 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
     const int yStride = c * 2;
     const Dtype *pSrc = mem;
     Dtype* pDst = mem_dst;
-    for (int y = 0; y < r;) {
-        for (int rows = 0; rows < interleavedRows; rows += 2) {
-            if ( y >= r ) break;
+    for (int y = 0; y < r;)
+    {
+        for (int rows = 0; rows < interleavedRows; rows += 2)
+        {
+            if (y >= r)
+                break;
             if ((c % xStride) == 0) {
                 for (int x = 0; x < c / xStride; x++) {
                     memcpy(pDst + x * xStride * 2,                         // NOLINT
@@ -712,11 +707,14 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
             y += 2;
         }
 
-        for (int rows = 0; rows < nonInterleavedRows; rows++) {
-            if (y >= r) break;
+        for (int rows = 0; rows < nonInterleavedRows; rows++)
+        {
+            if (y >= r)
+                break;
             const int stride = rowAlignment;
             int remaining = c;
-            for (int x = 0; x < c; x += stride) {
+            for (int x = 0; x < c; x += stride)
+            {
                 if (remaining >= stride) {
                     memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype));    // NOLINT
                     remaining -=stride;
@@ -852,34 +850,6 @@ bool OCL4DNNConvSpatial<float>::createBasicKernel(int32_t blockWidth,
         return false;
 }
 
-template<>
-void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
-                                                int32_t offset, int32_t size, bool write_only)
-{
-    cl_mem sub_mem;
-    cl_buffer_region region;
-    cl_int err;
-    size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
-
-    region.origin = offset * element_size + buffer.offset;
-    region.size = size * element_size;
-    sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
-                                write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
-                                CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
-    if (err)
-    {
-        std::cout << "Failed to create sub buffer." << std::endl;
-        return;
-    }
-
-    int step = element_size, rows = size, cols = 1;
-    ocl::convertFromBuffer(sub_mem, step, rows, cols,
-                           (use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
-
-    //decrease ocl mem refcount
-    clReleaseMemObject(sub_mem);
-}
-
 template<>
 bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                                          const UMat &weight, const UMat &bias,
@@ -938,7 +908,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             kernel.set(argIdx++, (uint16_t)output_h_);
             if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
             {
-                std::cout << "IDLF kernel run failed." << std::endl;
+                CV_LOG_ERROR(NULL, "DNN/OpenCL: IDLF kernel run failed");
                 return false;
             }
         }
@@ -1012,7 +982,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
 
             if (!kernel.run_(3, global_size, config->local_work_size, false))
             {
-                std::cout << "GEMM like kernel run failed." << std::endl;
+                CV_LOG_ERROR(NULL, "DNN/OpenCL: GEMM like kernel run failed");
                 return false;
             }
         }
@@ -1115,14 +1085,9 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
     {
         queue = cv::ocl::Queue::getDefault();
     }
-    catch (const cv::Exception&)
+    catch (const std::exception& e)
     {
-        static int warn_ = 0;
-        if (!warn_)
-        {
-            std::cout << "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning." << std::endl;
-            warn_ = true;
-        }
+        CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning: " << e.what());
         return 1e6;
     }
 
@@ -1326,9 +1291,9 @@ ocl::Program OCL4DNNConvSpatial<Dtype>::compileKernel()
     phash.insert(std::pair<std::string, ocl::Program>(kernel_name_, program));
     if (!program.ptr())
     {
-        std::cout << "Failed to compile kernel: " << kernel_name_
-                  << ", buildflags: " << options
-                  << ", errmsg: " << errmsg << std::endl;
+        CV_LOG_WARNING(NULL, "DNN/OpenCL: Failed to compile kernel: " << kernel_name_
+            << ", buildflags: '" << options << "', errmsg: '" << errmsg << "'"
+        );
     }
     return program;
 }
@@ -1754,7 +1719,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
                     fastestTime = kernelQueue[x]->executionTime;
                 }
             }
-            if (fastestKernel < 0) break;
+            if (fastestKernel < 0)
+                break;
             // Test fastest kernel
             bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
             if (verified == true) {
@@ -1913,17 +1879,18 @@ bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int typ
     {
         if (z == 1)
             z = 16;
-        CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
+        CV_Check(z, z == 16 || z == 8, "DNN/OpenCL: IDLF - invalid SIMD size");
     }
     kernelQueue.clear();
     createConvolutionKernel(type, x, y, z);
-    if (kernelQueue.size() != 1) {
-        std::cerr << "Failed setup kernel by config:"
+    if (kernelQueue.size() != 1)
+    {
+        CV_LOG_ERROR(NULL, "DNN/OpenCL: Failed setup kernel by config: "
             << " x = " << x
             << " y = " << y
             << " z = " << z
             << " type = " << type
-            << std::endl;
+        );
         return false;
     }
     bestKernelConfig = kernelQueue[0];
@@ -1955,13 +1922,9 @@ bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
     {
         if (cache_path_.empty())
         {
-            static int warn_ = 0;
-            if (!warn_)
-            {
-                std::cout << "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory " << std::endl
-                          << "                 via OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
-                warn_ = true;
-            }
+            CV_LOG_ONCE_WARNING(NULL, "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory "
+                "through OPENCV_OCL4DNN_CONFIG_PATH parameter."
+            );
         }
         return false;
     }
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index eb5d354020..c9ddacfb8e 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -161,23 +161,15 @@ __kernel void ConvolveBasic(
     const int out_idx = get_global_id(0);  // 1D task layout: [output_width * output_height * OUTPUT_Z]
     const int plane_size = output_width * output_height;
     const int out_plane_idx = out_idx % plane_size;
-    const int outputZ = out_idx / plane_size;
+    const int outputZ = out_idx / plane_size;  // kernelNum
     const int outputY = out_plane_idx / output_width;
     const int outputX = out_plane_idx % output_width;
-    const int kernelNum = outputZ * ZPAR;
-    if (kernelNum < OUTPUT_Z)
+    if (outputZ < OUTPUT_Z)
     {
-        Dtype sum[ZPAR];
-        for (int kern = 0; kern < ZPAR; kern++)
-        {
-            sum[kern] = 0.0f;
-        }
+        Dtype sum = 0.0f;
         const int org_y = outputY * STRIDE_Y - pad_h;
         const int org_x = outputX * STRIDE_X - pad_w;
-        const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
-#if APPLY_BIAS
-        const int biasIndex = bias_offset + kernelNum;
-#endif
+        const int currentKernelOffset = kernel_offset + outputZ*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
         const int local_image_offset = org_y * input_width + org_x;
         const int imageSize = input_width * input_height;
         __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
@@ -186,17 +178,13 @@ __kernel void ConvolveBasic(
         {
             for (int y = 0; y < KERNEL_HEIGHT; y++)
             {
+                int y_ = org_y + y * DILATION_Y;
                 for (int x = 0; x < KERNEL_WIDTH; x++)
                 {
-                    int y_ = org_y + y * DILATION_Y;
                     int x_ = org_x + x * DILATION_X;
-                    if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
+                    if (y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width)
                     {
-                        continue;
-                    }
-                    for (int kern = 0; kern < ZPAR; kern++)
-                    {
-                        sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
+                        sum = mad(image_dataPtr[x * DILATION_X], kernel_dataPtr[x], sum);
                     }
                 }
                 image_dataPtr += input_width * DILATION_Y;
@@ -205,18 +193,13 @@ __kernel void ConvolveBasic(
             image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
         }
 
-        for (int kern = 0; kern < ZPAR; kern++)
-        {
-            if (kernelNum + kern < OUTPUT_Z)
-            {
-                int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
+        int offset = convolved_image_offset + out_idx;
 #if APPLY_BIAS
-                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);
+        int biasIndex = bias_offset + outputZ;
+        ACTIVATION_FUNCTION(convolved_image, offset, sum + bias[biasIndex], biasIndex);
 #else
-                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], kernelNum + kern);
+        ACTIVATION_FUNCTION(convolved_image, offset, sum, outputZ);
 #endif
-            }
-        }
     }
 }
 
diff --git a/modules/features2d/src/orb.cpp b/modules/features2d/src/orb.cpp
index 85d17cdd0d..ae1e611127 100644
--- a/modules/features2d/src/orb.cpp
+++ b/modules/features2d/src/orb.cpp
@@ -131,12 +131,17 @@ static void
 HarrisResponses(const Mat& img, const std::vector<Rect>& layerinfo,
                 std::vector<KeyPoint>& pts, int blockSize, float harris_k)
 {
-    CV_Assert( img.type() == CV_8UC1 && blockSize*blockSize <= 2048 );
+    CV_CheckTypeEQ(img.type(), CV_8UC1, "");
+    CV_CheckGT(blockSize, 0, "");
+    CV_CheckLE(blockSize*blockSize, 2048, "");
 
     size_t ptidx, ptsize = pts.size();
 
     const uchar* ptr00 = img.ptr<uchar>();
-    int step = (int)(img.step/img.elemSize1());
+    size_t size_t_step = img.step;
+    CV_CheckLE(size_t_step * blockSize + blockSize + 1, (size_t)INT_MAX, "");  // ofs computation, step+1
+    int step = static_cast<int>(size_t_step);
+
     int r = blockSize/2;
 
     float scale = 1.f/((1 << 2) * blockSize * 255.f);
@@ -154,7 +159,7 @@ HarrisResponses(const Mat& img, const std::vector<Rect>& layerinfo,
         int y0 = cvRound(pts[ptidx].pt.y);
         int z = pts[ptidx].octave;
 
-        const uchar* ptr0 = ptr00 + (y0 - r + layerinfo[z].y)*step + x0 - r + layerinfo[z].x;
+        const uchar* ptr0 = ptr00 + (y0 - r + layerinfo[z].y)*size_t_step + (x0 - r + layerinfo[z].x);
         int a = 0, b = 0, c = 0;
 
         for( int k = 0; k < blockSize*blockSize; k++ )
diff --git a/modules/features2d/test/test_orb.cpp b/modules/features2d/test/test_orb.cpp
index 8a4d9776b3..89e2f7d78b 100644
--- a/modules/features2d/test/test_orb.cpp
+++ b/modules/features2d/test/test_orb.cpp
@@ -141,5 +141,31 @@ TEST(Features2D_ORB, regression_16197)
     ASSERT_NO_THROW(orbPtr->detectAndCompute(img, noArray(), kps, fv));
 }
 
+// Regression test for https://github.com/opencv/opencv-python/issues/537 (ORB on a 25000 x 25000 image)
+BIGDATA_TEST(Features2D_ORB, regression_opencv_python_537)  // memory usage: ~3 Gb
+{
+    applyTestTag(
+        CV_TEST_TAG_LONG,
+        CV_TEST_TAG_DEBUG_VERYLONG,
+        CV_TEST_TAG_MEMORY_6GB
+    );
+
+    const int width = 25000;
+    const int height = 25000;
+    Mat img(Size(width, height), CV_8UC1, Scalar::all(0));  // single-channel 8-bit image, initially all zeros
+
+    const int border = 23, num_lines = 23;
+    for (int i = 0; i < num_lines; i++)
+    {
+        cv::Point2i point1(border + i * 100, border + i * 100);
+        cv::Point2i point2(width - border - i * 100, height - border * i * 100);  // NOTE(review): y uses 'border * i * 100' while x uses 'border - i * 100' - confirm the asymmetry is intended
+        cv::line(img, point1, point2, 255, 1, LINE_AA);
+    }
+
+    Ptr<ORB> orbPtr = ORB::create(31);
+    std::vector<KeyPoint> kps;
+    Mat fv;
+    ASSERT_NO_THROW(orbPtr->detectAndCompute(img, noArray(), kps, fv));  // passes as long as no exception is thrown
+}
 
 }} // namespace
diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp
index 39cd412407..dd91bf1db4 100644
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@@ -860,7 +860,7 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
             }
             else
             {
-                CV_WARN("Error opening file: " << filename << " (" << err->message << ")");
+                CV_WARN("Error opening file: " << filename << " (" << (err ? err->message : "<unknown reason>") << ")");
                 return false;
             }
         }
@@ -868,9 +868,9 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
         {
             GSafePtr<GError> err;
             uridecodebin.attach(gst_parse_launch(filename, err.getRef()));
-            if (err)
+            if (!uridecodebin)
             {
-                CV_WARN("Error opening bin: " << err->message);
+                CV_WARN("Error opening bin: " << (err ? err->message : "<unknown reason>"));
                 return false;
             }
             manualpipeline = true;
@@ -2073,7 +2073,7 @@ void handleMessage(GstElement * pipeline)
                 gst_message_parse_error(msg, err.getRef(), debug.getRef());
                 GSafePtr<gchar> name; name.attach(gst_element_get_name(GST_MESSAGE_SRC (msg)));
                 CV_WARN("Embedded video playback halted; module " << name.get() <<
-                        " reported: " << err->message);
+                        " reported: " << (err ? err->message : "<unknown reason>"));
                 CV_LOG_DEBUG(NULL, "GStreamer debug: " << debug.get());
 
                 gst_element_set_state(GST_ELEMENT(pipeline), GST_STATE_NULL);
diff --git a/platforms/scripts/valgrind.supp b/platforms/scripts/valgrind.supp
index 10ea856e9d..c71cdf965d 100644
--- a/platforms/scripts/valgrind.supp
+++ b/platforms/scripts/valgrind.supp
@@ -51,7 +51,7 @@
    OpenCV-TLS-getTlsStorage
    Memcheck:Leak
    ...
-   fun:_ZN2cvL13getTlsStorageEv
+   fun:_ZN2cv*L13getTlsStorageEv
 }
 
 {
@@ -159,7 +159,7 @@
    OpenCV-DNN-getLayerFactoryMutex
    Memcheck:Leak
    ...
-   fun:_ZN2cv3dnn*L20getLayerFactoryMutexEv
+   fun:_ZN2cv3dnn*20getLayerFactoryMutexEv
 }
 
 {