diff --git a/3rdparty/carotene/hal/tegra_hal.hpp b/3rdparty/carotene/hal/tegra_hal.hpp index cb658e8af0..31182a029a 100644 --- a/3rdparty/carotene/hal/tegra_hal.hpp +++ b/3rdparty/carotene/hal/tegra_hal.hpp @@ -1932,4 +1932,34 @@ inline int TEGRA_GaussianBlurBinomial(const uchar* src_data, size_t src_step, uc #endif // OPENCV_IMGPROC_HAL_INTERFACE_H +// The optimized branch was developed for old armv7 processors +#if defined(__ARM_ARCH) && (__ARM_ARCH == 7) +inline int TEGRA_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_step, + const short* prev_deriv_data, size_t prev_deriv_step, + const uchar* next_data, size_t next_step, + int width, int height, int cn, + const float *prev_points, float *next_points, size_t point_count, + uchar *status, float *err, + const int win_width, const int win_height, + int termination_count, double termination_epsilon, + bool get_min_eigen_vals, + float min_eigen_vals_threshold) +{ + if (!CAROTENE_NS::isSupportedConfiguration()) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + CAROTENE_NS::pyrLKOptFlowLevel(CAROTENE_NS::Size2D(width, height), cn, + prev_data, prev_data_step, prev_deriv_data, prev_deriv_step, + next_data, next_step, + point_count, prev_points, next_points, + status, err, CAROTENE_NS::Size2D(win_width, win_height), + termination_count, termination_epsilon, + get_min_eigen_vals, min_eigen_vals_threshold); + return CV_HAL_ERROR_OK; +} + +#undef cv_hal_LKOpticalFlowLevel +#define cv_hal_LKOpticalFlowLevel TEGRA_LKOpticalFlowLevel +#endif // __ARM_ARCH=7 + #endif diff --git a/3rdparty/carotene/include/carotene/functions.hpp b/3rdparty/carotene/include/carotene/functions.hpp index 76d1328194..8a4fa3efdd 100644 --- a/3rdparty/carotene/include/carotene/functions.hpp +++ b/3rdparty/carotene/include/carotene/functions.hpp @@ -2485,7 +2485,7 @@ namespace CAROTENE_NS { u8 *status, f32 *err, const Size2D &winSize, u32 terminationCount, f64 terminationEpsilon, - u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + bool getMinEigenVals, f32 minEigThreshold); } diff --git a/3rdparty/carotene/src/opticalflow.cpp b/3rdparty/carotene/src/opticalflow.cpp index 7b29742c84..463ba77fa0 100644 --- a/3rdparty/carotene/src/opticalflow.cpp +++ b/3rdparty/carotene/src/opticalflow.cpp @@ -58,7 +58,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, u8 *status, f32 *err, const Size2D &winSize, u32 terminationCount, f64 terminationEpsilon, - u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + bool getMinEigenVals, f32 minEigThreshold) { internal::assertSupportedConfiguration(); @@ -74,32 +74,11 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, for( u32 ptidx = 0; ptidx < ptCount; ptidx++ ) { - f32 levscale = (1./(1 << level)); u32 ptref = ptidx << 1; - f32 prevPtX = prevPts[ptref+0]*levscale; - f32 prevPtY = prevPts[ptref+1]*levscale; - f32 nextPtX; - f32 nextPtY; - if( level == maxLevel ) - { - if( useInitialFlow ) - { - nextPtX = nextPts[ptref+0]*levscale; - nextPtY = nextPts[ptref+1]*levscale; - } - else - { - nextPtX = prevPtX; - nextPtY = prevPtY; - } - } - else - { - nextPtX = nextPts[ptref+0]*2.f; - nextPtY = nextPts[ptref+1]*2.f; - } - nextPts[ptref+0] = nextPtX; - nextPts[ptref+1] = nextPtY; + f32 prevPtX = prevPts[ptref+0]; + f32 prevPtY = prevPts[ptref+1]; + f32 nextPtX = nextPts[ptref+0]; + f32 nextPtY = nextPts[ptref+1]; s32 iprevPtX, iprevPtY; s32 inextPtX, inextPtY; @@ -111,13 +90,10 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width || 
iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height ) { - if( level == 0 ) - { - if( status ) - status[ptidx] = false; - if( err ) - err[ptidx] = 0; - } + if( status ) + status[ptidx] = false; + if( err ) + err[ptidx] = 0; continue; } @@ -333,7 +309,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, if( minEig < minEigThreshold || D < FLT_EPSILON ) { - if( level == 0 && status ) + if( status ) status[ptidx] = false; continue; } @@ -353,7 +329,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width || inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height ) { - if( level == 0 && status ) + if( status ) status[ptidx] = false; break; } @@ -469,8 +445,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, prevDeltaX = deltaX; prevDeltaY = deltaY; } - - if( status && status[ptidx] && err && level == 0 && !getMinEigenVals ) + if( status && status[ptidx] && err && !getMinEigenVals ) { f32 nextPointX = nextPts[ptref+0] - halfWinX; f32 nextPointY = nextPts[ptref+1] - halfWinY; @@ -526,9 +501,6 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, (void)winSize; (void)terminationCount; (void)terminationEpsilon; - (void)level; - (void)maxLevel; - (void)useInitialFlow; (void)getMinEigenVals; (void)minEigThreshold; (void)ptCount; @@ -536,4 +508,3 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, } }//CAROTENE_NS - diff --git a/3rdparty/ndsrvp/include/imgproc.hpp b/3rdparty/ndsrvp/include/imgproc.hpp index 94104f0b71..db0ee05132 100644 --- a/3rdparty/ndsrvp/include/imgproc.hpp +++ b/3rdparty/ndsrvp/include/imgproc.hpp @@ -5,6 +5,8 @@ #ifndef OPENCV_NDSRVP_IMGPROC_HPP #define OPENCV_NDSRVP_IMGPROC_HPP +struct cvhalFilter2D; + namespace cv { namespace ndsrvp { @@ -71,6 +73,34 @@ int threshold(const uchar* src_data, size_t src_step, #undef cv_hal_threshold #define cv_hal_threshold (cv::ndsrvp::threshold) +// ################ filter ################ + +int filterInit(cvhalFilter2D **context, + uchar *kernel_data, size_t kernel_step, + int kernel_type, int kernel_width, + int kernel_height, int max_width, int max_height, + int src_type, int dst_type, int borderType, + double delta, int anchor_x, int anchor_y, + bool allowSubmatrix, bool allowInplace); + +#undef cv_hal_filterInit +#define cv_hal_filterInit (cv::ndsrvp::filterInit) + +int filter(cvhalFilter2D *context, + const uchar *src_data, size_t src_step, + uchar *dst_data, size_t dst_step, + int width, int height, + int full_width, int full_height, + int offset_x, int offset_y); + +#undef cv_hal_filter +#define cv_hal_filter (cv::ndsrvp::filter) + +int filterFree(cvhalFilter2D *context); + +#undef cv_hal_filterFree +#define cv_hal_filterFree (cv::ndsrvp::filterFree) + } // namespace ndsrvp } // namespace cv diff --git a/3rdparty/ndsrvp/src/cvutils.cpp b/3rdparty/ndsrvp/src/cvutils.cpp index 48e025488f..6afac5136d 100644 --- a/3rdparty/ndsrvp/src/cvutils.cpp +++ b/3rdparty/ndsrvp/src/cvutils.cpp @@ -73,6 +73,40 @@ int borderInterpolate(int p, int len, int borderType) return p; } +int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType) +{ + int16x4_t vzero = (int16x4_t){0, 0, 0, 0}; + int16x4_t vone = (int16x4_t){1, 1, 1, 1}; + int16x4_t vlen = (int16x4_t){len, len, len, len}; + if(borderType == CV_HAL_BORDER_REPLICATE) + vp = (int16x4_t)__nds__bpick(0, __nds__bpick((long)(vlen - 1), (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); + else if(borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101) + { + int16x4_t vdelta 
= (borderType == CV_HAL_BORDER_REFLECT_101) ? vone : vzero; + if(len == 1) + return vzero; + do + { + int16x4_t vneg = -vp - 1 + vdelta; + int16x4_t vpos = vlen - 1 - (vp - vlen) - vdelta; + vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); + } + while( (long)(vp >= vlen) || (long)(vp < 0) ); + } + else if(borderType == CV_HAL_BORDER_WRAP) + { + ndsrvp_assert(len > 0); + int16x4_t vneg = vp - ((vp - vlen + 1) / vlen) * vlen; + int16x4_t vpos = vp % vlen; + vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); + } + else if(borderType == CV_HAL_BORDER_CONSTANT) + vp = (int16x4_t)__nds__bpick((long)-vone, (long)vp, (long)(vp < 0 || vp >= vlen)); + else + ndsrvp_error(Error::StsBadArg, "borderInterpolate_vector(): Unknown/unsupported border type"); + return vp; +} + } // namespace ndsrvp } // namespace cv diff --git a/3rdparty/ndsrvp/src/cvutils.hpp b/3rdparty/ndsrvp/src/cvutils.hpp index 8cf1476ed6..78bb11d95f 100644 --- a/3rdparty/ndsrvp/src/cvutils.hpp +++ b/3rdparty/ndsrvp/src/cvutils.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -26,16 +27,26 @@ namespace ndsrvp { void* fastMalloc(size_t size); void fastFree(void* ptr); int borderInterpolate(int p, int len, int borderType); +int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType); #ifndef MAX # define MAX(a,b) ((a) < (b) ? (b) : (a)) #endif +#ifndef MIN +# define MIN(a,b) ((a) > (b) ? (b) : (a)) +#endif + #define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT) #define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1) +#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15) +#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type)) + #define CV_MALLOC_ALIGN 64 +inline size_t getElemSize(int type) { return (size_t)CV_ELEM_SIZE(type); } + // error codes enum Error{ @@ -69,6 +80,135 @@ inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b) return (int32x2_t)__nds__bpick((long)a, __nds__bpick((long)(b - 1), (long)x, (long)(x < b)), (long)(x >= a)); } +// expand + +/* + [0] [1] [2] [3] [4] [5] [6] [7] +810 [ 0 ] [ 1 ] [ 4 ] [ 5 ] +832 [ 2 ] [ 3 ] [ 6 ] [ 7 ] +bb [ 0 ] [ 1 ] [ 2 ] [ 3 ] +tt [ 4 ] [ 5 ] [ 6 ] [ 7 ] +*/ + +inline void ndsrvp_u8_u16_expand8(const unsigned long vs, ushort* dst) +{ + unsigned long vs810 = __nds__zunpkd810(vs); + unsigned long vs832 = __nds__zunpkd832(vs); + *(unsigned long*)dst = __nds__pkbb32(vs832, vs810); + *(unsigned long*)(dst + 4) = __nds__pktt32(vs832, vs810); +} + +/* + [0] [1] [2] [3] [4] [5] [6] [7] +820 [ 0 ] [ 2 ] [ 4 ] [ 6 ] +831 [ 1 ] [ 3 ] [ 5 ] [ 7 ] +bb [ 0 ] [ 2 ] [ 1 ] [ 3 ] +tt [ 4 ] [ 6 ] [ 5 ] [ 7 ] +*/ + +inline void ndsrvp_u8_u16_eswap8(const unsigned long vs, ushort* dst) +{ + unsigned long vs820 = __nds__zunpkd820(vs); + unsigned long vs831 = __nds__zunpkd831(vs); + *(unsigned long*)dst = __nds__pkbb32(vs831, vs820); + *(unsigned long*)(dst + 4) = __nds__pktt32(vs831, vs820); +} + +/* + [0] [1] [2] [3] [4] [5] [6] [7] +820 [ 0 ] [ 2 ] [ 4 ] [ 6 ] +831 [ 1 ] [ 3 ] [ 5 ] [ 7 ] +bb [ 0 ] [ 2 ] [ 1 ] [ 3 ] +tt [ 4 ] [ 6 ] [ 5 ] [ 7 ] +bbbb[ 0 ] [ 1 ] +bbtt[ 2 ] [ 3 ] +ttbb[ 4 ] [ 5 ] +tttt[ 6 ] [ 7 ] +*/ + + +inline void ndsrvp_u8_u32_expand8(const unsigned long vs, uint* dst) +{ + unsigned long vs820 = __nds__zunpkd820(vs); + unsigned long vs831 = __nds__zunpkd831(vs); + unsigned long vsbb = __nds__pkbb32(vs831, vs820); + unsigned long vstt = __nds__pktt32(vs831, vs820); + 
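/* pkbb16/pktt16 with a zero first operand pair each u16 lane with a zero halfword, i.e. they zero-extend the lanes to u32; the four stores below then write lanes 0-1, 2-3, 4-5 and 6-7 in order. */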
*(unsigned long*)dst = __nds__pkbb16(0, vsbb);
+    *(unsigned long*)(dst + 2) = __nds__pktt16(0, vsbb);
+    *(unsigned long*)(dst + 4) = __nds__pkbb16(0, vstt);
+    *(unsigned long*)(dst + 6) = __nds__pktt16(0, vstt);
+}
+
+// float replacement
+
+inline void ndsrvp_f32_add8(const float* a, const float* b, float* c)
+{
+    c[0] = a[0] + b[0];
+    c[1] = a[1] + b[1];
+    c[2] = a[2] + b[2];
+    c[3] = a[3] + b[3];
+    c[4] = a[4] + b[4];
+    c[5] = a[5] + b[5];
+    c[6] = a[6] + b[6];
+    c[7] = a[7] + b[7];
+}
+
+/*
+    [1] [8] [23]
+    [24] [8]
+*/
+
+inline void ndsrvp_f32_u8_mul8(const float* a, const unsigned long b, float* c) // experimental, not bit exact
+{
+    const int mask_frac = 0x007FFFFF;
+    const int mask_sign = 0x7FFFFFFF;
+    const int mask_lead = 0x40000000;
+    const int ofs_exp = 23;
+
+    uint32x2_t va01 = *(uint32x2_t*)a;
+    uint32x2_t va23 = *(uint32x2_t*)(a + 2);
+    uint32x2_t va45 = *(uint32x2_t*)(a + 4);
+    uint32x2_t va67 = *(uint32x2_t*)(a + 6);
+
+    uint32x2_t vaexp01 = va01 >> ofs_exp;
+    uint32x2_t vaexp23 = va23 >> ofs_exp;
+    uint32x2_t vaexp45 = va45 >> ofs_exp;
+    uint32x2_t vaexp67 = va67 >> ofs_exp;
+
+    uint32x2_t vafrac01 = ((va01 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac23 = ((va23 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac45 = ((va45 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac67 = ((va67 << 7) & mask_sign) | mask_lead;
+
+    int16x4_t vb[2]; // fake signed for signed multiply
+    ndsrvp_u8_u16_eswap8(b, (ushort*)vb);
+
+    vafrac01 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac01, (unsigned long)vb[0]);
+    vafrac23 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac23, (unsigned long)vb[0]);
+    vafrac45 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac45, (unsigned long)vb[1]);
+    vafrac67 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac67, (unsigned long)vb[1]);
+
+    uint32x2_t vaclz01 = __nds__v_clz32(vafrac01) - 8;
+    uint32x2_t vaclz23 = __nds__v_clz32(vafrac23) - 8;
+    uint32x2_t vaclz45 = __nds__v_clz32(vafrac45) - 8;
+    uint32x2_t vaclz67 = __nds__v_clz32(vafrac67) - 8;
+
+    vaexp01 += 8 - vaclz01;
+    vaexp23 += 8 - vaclz23;
+    vaexp45 += 8 - vaclz45;
+    vaexp67 += 8 - vaclz67;
+
+    vafrac01 <<= vaclz01;
+    vafrac23 <<= vaclz23;
+    vafrac45 <<= vaclz45;
+    vafrac67 <<= vaclz67;
+
+    *(uint32x2_t*)c = (vaexp01 << ofs_exp) | (vafrac01 & mask_frac);
+    *(uint32x2_t*)(c + 2) = (vaexp23 << ofs_exp) | (vafrac23 & mask_frac);
+    *(uint32x2_t*)(c + 4) = (vaexp45 << ofs_exp) | (vafrac45 & mask_frac);
+    *(uint32x2_t*)(c + 6) = (vaexp67 << ofs_exp) | (vafrac67 & mask_frac);
+}
+
 // saturate
 
 template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
@@ -94,6 +94,26 @@ template<> inline short saturate_cast<short>(double v) { return saturate_cas
 template<> inline int saturate_cast<int>(float v) { return (int)lrintf(v); }
 template<> inline int saturate_cast<int>(double v) { return (int)lrint(v); }
 
+inline double cast_ptr_to_double(const uchar* v, int depth) {
+    switch (depth) {
+        case CV_8U: return (double)*(uchar*)v;
+        case CV_8S: return (double)*(char*)v;
+        case CV_16U: return (double)*(ushort*)v;
+        case CV_16S: return (double)*(short*)v;
+        case CV_32S: return (double)*(int*)v;
+        case CV_32F: return (double)*(float*)v;
+        case CV_64F: return (double)*(double*)v;
+        case CV_16F: return (double)*(float*)v;
+        default: return 0;
+    }
+}
+
+template <typename _Tp>
+inline _Tp data_at(const uchar* data, int step, int y, int x, int cn)
+{
+    return ((_Tp*)(data + y * step))[x * cn];
+}
+
 // align
 
 inline long align(size_t v, int n)
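A scalar model makes the lane bookkeeping above easier to check. The sketch below is not part of the patch; it assumes a 64-bit little-endian unsigned long (as on RV64, and as the lane diagrams above imply) and mirrors ndsrvp_u8_u16_expand8:

    /* reference model: widen eight packed u8 lanes to eight u16 lanes, in lane order */
    #include <string.h>
    typedef unsigned char uchar;
    typedef unsigned short ushort;

    static void ref_u8_u16_expand8(unsigned long vs, ushort* dst)
    {
        uchar lane[8];
        memcpy(lane, &vs, 8);      /* lane[0] is the lowest byte of vs */
        for (int i = 0; i < 8; i++)
            dst[i] = lane[i];      /* zero-extend; same result as zunpkd810/832 + pkbb32/pktt32 */
    }

Comparing its output with the intrinsic version on an RVP target is a quick way to validate the pack/unpack shuffles; the eswap8 variant differs only in its interleaved lane order.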
diff --git a/3rdparty/ndsrvp/src/filter.cpp b/3rdparty/ndsrvp/src/filter.cpp
new file mode 100644
index 0000000000..89508eea11
--- /dev/null
+++ b/3rdparty/ndsrvp/src/filter.cpp
@@ -0,0 +1,321 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+class FilterData
+{
+public:
+    FilterData(uchar *_kernel_data, size_t _kernel_step, int _kernel_type, int _src_type, int _dst_type, int _borderType,
+        int _kernel_width, int _kernel_height, int _max_width, int _max_height, double _delta, int _anchor_x, int _anchor_y)
+        : kernel_data(_kernel_data), kernel_step(_kernel_step), kernel_type(_kernel_type), src_type(_src_type), dst_type(_dst_type), borderType(_borderType),
+        kernel_width(_kernel_width), kernel_height(_kernel_height), max_width(_max_width), max_height(_max_height), delta(_delta), anchor_x(_anchor_x), anchor_y(_anchor_y)
+    {
+    }
+
+    uchar *kernel_data;
+    size_t kernel_step; // bytes between rows(height)
+    int kernel_type, src_type, dst_type, borderType;
+    int kernel_width, kernel_height;
+    int max_width, max_height;
+    double delta;
+    int anchor_x, anchor_y;
+    std::vector<uchar> coords;
+    std::vector<float> coeffs;
+    int nz;
+    std::vector<uchar> padding;
+};
+
+static int countNonZero(const FilterData* ctx)
+{
+    int i, j, nz = 0;
+    const uchar* ker_row = ctx->kernel_data;
+    for( i = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
+    {
+        for( j = 0; j < ctx->kernel_width; j++ )
+        {
+            if( ((float*)ker_row)[j] != 0.0 )
+                nz++;
+        }
+    }
+    return nz;
+}
+
+static void preprocess2DKernel(FilterData* ctx)
+{
+    int i, j, k, nz = countNonZero(ctx), ktype = ctx->kernel_type;
+    if(nz == 0)
+        nz = 1; // (0, 0) == 0 by default
+    ndsrvp_assert( ktype == CV_32F );
+
+    ctx->coords.resize(nz * 2);
+    ctx->coeffs.resize(nz);
+
+    const uchar* ker_row = ctx->kernel_data;
+    for( i = k = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
+    {
+        for( j = 0; j < ctx->kernel_width; j++ )
+        {
+            float val = ((float*)ker_row)[j];
+            if( val == 0.0 )
+                continue;
+            ctx->coords[k * 2] = j;
+            ctx->coords[k * 2 + 1] = i;
+            ctx->coeffs[k++] = val;
+        }
+    }
+
+    ctx->nz = k;
+}
+
+int filterInit(cvhalFilter2D **context,
+    uchar *kernel_data, size_t kernel_step,
+    int kernel_type, int kernel_width,
+    int kernel_height, int max_width, int max_height,
+    int src_type, int dst_type, int borderType,
+    double delta, int anchor_x, int anchor_y,
+    bool allowSubmatrix, bool allowInplace)
+{
+    int sdepth = CV_MAT_DEPTH(src_type), ddepth = CV_MAT_DEPTH(dst_type);
+    int cn = CV_MAT_CN(src_type), kdepth = kernel_type;
+
+    (void)allowSubmatrix;
+    (void)allowInplace;
+
+    if(delta - (int)delta != 0.0)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    if(kdepth != CV_32F || (sdepth != CV_8U && sdepth != CV_16U) || ddepth != sdepth)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    FilterData *ctx = new FilterData(kernel_data, kernel_step, kernel_type, src_type, dst_type, borderType,
+        kernel_width, kernel_height, max_width, max_height, delta, anchor_x, anchor_y);
+
+    *context = (cvhalFilter2D*)ctx;
+
+    ndsrvp_assert(cn == CV_MAT_CN(dst_type) && ddepth >= sdepth);
+
+    preprocess2DKernel(ctx);
+
+    return CV_HAL_ERROR_OK;
+}
+
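+// Note: filterInit above only accepts CV_32F kernels over CV_8U or CV_16U images with
+// matching source and destination depth, and it rejects a fractional delta; filter()
+// below re-saturates delta to the destination depth before accumulating.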
+int filter(cvhalFilter2D *context,
+    const uchar *src_data, size_t src_step,
+    uchar *dst_data, size_t dst_step,
+    int width, int height,
+    int full_width, int full_height,
+    int offset_x, int offset_y)
+{
+    FilterData *ctx = (FilterData*)context;
+
+    int cn = CV_MAT_CN(ctx->src_type);
+    int cnes = CV_ELEM_SIZE(ctx->src_type);
+    int ddepth = CV_MAT_DEPTH(ctx->dst_type);
+    float delta_sat = (uchar)(ctx->delta);
+    if(ddepth == CV_8U)
+        delta_sat = (float)saturate_cast<uchar>(ctx->delta);
+    else if(ddepth == CV_16U)
+        delta_sat = (float)saturate_cast<ushort>(ctx->delta);
+
+    // fetch original image data
+    const uchar *ogn_data = src_data - offset_y * src_step - offset_x * cnes;
+    int ogn_step = src_step;
+
+    // ROI fully used in the computation
+    int cal_width = width + ctx->kernel_width - 1;
+    int cal_height = height + ctx->kernel_height - 1;
+    int cal_x = offset_x - ctx->anchor_x;
+    int cal_y = offset_y - ctx->anchor_y;
+
+    // calculate source border
+    ctx->padding.resize(cal_width * cal_height * cnes);
+    uchar* pad_data = &ctx->padding[0];
+    int pad_step = cal_width * cnes;
+
+    uchar* pad_ptr;
+    const uchar* ogn_ptr;
+    std::vector<uchar> vec_zeros(cnes, 0);
+    for(int i = 0; i < cal_height; i++)
+    {
+        int y = borderInterpolate(i + cal_y, full_height, ctx->borderType);
+        if(y < 0) {
+            memset(pad_data + i * pad_step, 0, cnes * cal_width);
+            continue;
+        }
+
+        // left border
+        int j = 0;
+        int16x4_t vj = {0, 1, 2, 3};
+        vj += saturate_cast<short>(cal_x);
+        for(; j + cal_x < -4; j += 4, vj += 4)
+        {
+            int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
+            for(int k = 0; k < 4; k++) {
+                if(vx[k] < 0) // border constant return value -1
+                    ogn_ptr = &vec_zeros[0];
+                else
+                    ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
+                pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
+                memcpy(pad_ptr, ogn_ptr, cnes);
+            }
+        }
+        for(; j + cal_x < 0; j++)
+        {
+            int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
+            if(x < 0) // border constant return value -1
+                ogn_ptr = &vec_zeros[0];
+            else
+                ogn_ptr = ogn_data + y * ogn_step + x * cnes;
+            pad_ptr = pad_data + i * pad_step + j * cnes;
+            memcpy(pad_ptr, ogn_ptr, cnes);
+        }
+
+        // center
+        int rborder = MIN(cal_width, full_width - cal_x);
+        ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cnes;
+        pad_ptr = pad_data + i * pad_step + j * cnes;
+        memcpy(pad_ptr, ogn_ptr, cnes * (rborder - j));
+
+        // right border
+        j = rborder;
+        vj = (int16x4_t){0, 1, 2, 3} + saturate_cast<short>(cal_x + rborder);
+        for(; j <= cal_width - 4; j += 4, vj += 4)
+        {
+            int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
+            for(int k = 0; k < 4; k++) {
+                if(vx[k] < 0) // border constant return value -1
+                    ogn_ptr = &vec_zeros[0];
+                else
+                    ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
+                pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
+                memcpy(pad_ptr, ogn_ptr, cnes);
+            }
+        }
+        for(; j < cal_width; j++)
+        {
+            int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
+            if(x < 0) // border constant return value -1
+                ogn_ptr = &vec_zeros[0];
+            else
+                ogn_ptr = ogn_data + y * ogn_step + x * cnes;
+            pad_ptr = pad_data + i * pad_step + j * cnes;
+            memcpy(pad_ptr, ogn_ptr, cnes);
+        }
+    }
+
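+    // At this point ctx->padding holds a fully bordered copy of the ROI,
+    // (kernel_width - 1) columns wider and (kernel_height - 1) rows taller, so the
+    // convolution loops below can read through the ker_pts offsets without bounds checks.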
+    // prepare the pointers
+    int i, k, count, nz = ctx->nz;
+    const uchar* ker_pts = &ctx->coords[0];
+    const float* ker_cfs = &ctx->coeffs[0];
+
+    if( ddepth == CV_8U )
+    {
+        std::vector<uchar*> src_ptrarr;
+        src_ptrarr.resize(nz);
+        uchar** src_ptrs = &src_ptrarr[0];
+        uchar* dst_row = dst_data;
+        uchar* pad_row = pad_data;
+
+        for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
+        {
+            for( k = 0; k < nz; k++ )
+                src_ptrs[k] = (uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes;
+
+            i = 0;
+            for( ; i <= width * cnes - 8; i += 8 )
+            {
+                float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat};
+                for( k = 0; k < nz; k++ ) {
+                    float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
+                    // experimental code
+                    // ndsrvp_f32_u8_mul8(vker_cfs, *(unsigned long*)(src_ptrs[k] + i), vker_cfs);
+                    // ndsrvp_f32_add8(vs0, vker_cfs, vs0);
+                    vs0[0] += vker_cfs[0] * src_ptrs[k][i];
+                    vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
+                    vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
+                    vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
+                    vs0[4] += vker_cfs[4] * src_ptrs[k][i + 4];
+                    vs0[5] += vker_cfs[5] * src_ptrs[k][i + 5];
+                    vs0[6] += vker_cfs[6] * src_ptrs[k][i + 6];
+                    vs0[7] += vker_cfs[7] * src_ptrs[k][i + 7];
+                }
+                dst_row[i] = saturate_cast<uchar>(vs0[0]);
+                dst_row[i + 1] = saturate_cast<uchar>(vs0[1]);
+                dst_row[i + 2] = saturate_cast<uchar>(vs0[2]);
+                dst_row[i + 3] = saturate_cast<uchar>(vs0[3]);
+                dst_row[i + 4] = saturate_cast<uchar>(vs0[4]);
+                dst_row[i + 5] = saturate_cast<uchar>(vs0[5]);
+                dst_row[i + 6] = saturate_cast<uchar>(vs0[6]);
+                dst_row[i + 7] = saturate_cast<uchar>(vs0[7]);
+            }
+            for( ; i < width * cnes; i++ )
+            {
+                float s0 = delta_sat;
+                for( k = 0; k < nz; k++ ) {
+                    s0 += ker_cfs[k] * src_ptrs[k][i];
+                }
+                dst_row[i] = saturate_cast<uchar>(s0);
+            }
+        }
+    }
+    else if( ddepth == CV_16U )
+    {
+        std::vector<ushort*> src_ptrarr;
+        src_ptrarr.resize(nz);
+        ushort** src_ptrs = &src_ptrarr[0];
+        uchar* dst_row = dst_data;
+        uchar* pad_row = pad_data;
+
+        for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
+        {
+            for( k = 0; k < nz; k++ )
+                src_ptrs[k] = (ushort*)((uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes);
+
+            i = 0;
+            for( ; i <= width * cn - 4; i += 4 )
+            {
+                float vs0[4] = {delta_sat, delta_sat, delta_sat, delta_sat};
+                for( k = 0; k < nz; k++ ) {
+                    float vker_cfs[4] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
+                    vs0[0] += vker_cfs[0] * src_ptrs[k][i];
+                    vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
+                    vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
+                    vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
+                }
+                ushort* dst_row_ptr = (ushort*)dst_row;
+                dst_row_ptr[i] = saturate_cast<ushort>(vs0[0]);
+                dst_row_ptr[i + 1] = saturate_cast<ushort>(vs0[1]);
+                dst_row_ptr[i + 2] = saturate_cast<ushort>(vs0[2]);
+                dst_row_ptr[i + 3] = saturate_cast<ushort>(vs0[3]);
+            }
+            for( ; i < width * cn; i++ )
+            {
+                float s0 = delta_sat;
+                for( k = 0; k < nz; k++ ) {
+                    s0 += ker_cfs[k] * src_ptrs[k][i];
+                }
+                ((ushort*)dst_row)[i] = saturate_cast<ushort>(s0);
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+int filterFree(cvhalFilter2D *context) {
+    FilterData *ctx = (FilterData*)context;
+    delete ctx;
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/zlib-ng/CMakeLists.txt b/3rdparty/zlib-ng/CMakeLists.txt
index c05511ca87..83e6dac542 100644
--- a/3rdparty/zlib-ng/CMakeLists.txt
+++ b/3rdparty/zlib-ng/CMakeLists.txt
@@ -1,12 +1,38 @@
-project(${ZLIB_LIBRARY} LANGUAGES C)
-
-if("c_std_11" IN_LIST CMAKE_C_COMPILE_FEATURES)
-  set(CMAKE_C_STANDARD 11) # The C standard whose features are requested to build this target
+cmake_minimum_required(VERSION 3.5.1)
+if(CMAKE_VERSION VERSION_LESS 3.12)
+  cmake_policy(VERSION ${CMAKE_VERSION})
 else()
-  set(CMAKE_C_STANDARD 99)
+  cmake_policy(VERSION 3.5.1...3.29.0)
 endif()
-set(CMAKE_C_STANDARD_REQUIRED ON) # Boolean describing whether the value of C_STANDARD is a requirement
-set(CMAKE_C_EXTENSIONS OFF) # Boolean specifying whether compiler specific extensions are requested
+message(STATUS "Using CMake version ${CMAKE_VERSION}")
+
+# If not specified on the command line, 
enable C11 as the default +# Configuration items that affect the global compiler environment standards +# should be issued before the "project" command. +if(NOT CMAKE_C_STANDARD) + set(CMAKE_C_STANDARD 11) # The C standard whose features are requested to build this target +endif() +if(NOT CMAKE_C_STANDARD_REQUIRED) + set(CMAKE_C_STANDARD_REQUIRED ON) # Boolean describing whether the value of C_STANDARD is a requirement +endif() +if(NOT CMAKE_C_EXTENSIONS) + set(CMAKE_C_EXTENSIONS OFF) # Boolean specifying whether compiler specific extensions are requested +endif() +set(VALID_C_STANDARDS "99" "11") +if(NOT CMAKE_C_STANDARD IN_LIST VALID_C_STANDARDS) + MESSAGE(FATAL_ERROR "CMAKE_C_STANDARD:STRING=${CMAKE_C_STANDARD} not in known standards list\n ${VALID_C_STANDARDS}") +endif() + +# Parse the full version number from zlib.h.in and include in ZLIB_FULL_VERSION +file(READ ${CMAKE_CURRENT_SOURCE_DIR}/zlib.h.in _zlib_h_contents) +string(REGEX REPLACE ".*#define[ \t]+ZLIB_VERSION[ \t]+\"([0-9]+.[0-9]+.[0-9]+).*\".*" + "\\1" ZLIB_HEADER_VERSION ${_zlib_h_contents}) +string(REGEX REPLACE ".*#define[ \t]+ZLIBNG_VERSION[ \t]+\"([-0-9A-Za-z.]+)\".*" + "\\1" ZLIBNG_HEADER_VERSION ${_zlib_h_contents}) +message(STATUS "ZLIB_HEADER_VERSION: ${ZLIB_HEADER_VERSION}") +message(STATUS "ZLIBNG_HEADER_VERSION: ${ZLIBNG_HEADER_VERSION}") + +project(zlib VERSION ${ZLIB_HEADER_VERSION} LANGUAGES C) include(CheckTypeSize) include(CheckSymbolExists) @@ -16,142 +42,325 @@ include(CheckCSourceCompiles) include(CheckCSourceRuns) include(CheckCCompilerFlag) include(CMakeDependentOption) +include(CMakePackageConfigHelpers) +include(FeatureSummary) -if(X86_64 OR X86) - set(BASEARCH_X86_FOUND TRUE) -endif() -if(AARCH64 OR ARM) - set(BASEARCH_ARM_FOUND TRUE) -endif() -if(PPC64LE OR PPC64) - set(BASEARCH_PPC_FOUND TRUE) -endif() -if(RISCV) - set(BASEARCH_RISCV_FOUND TRUE) -endif() - +include(cmake/detect-arch.cmake) +include(cmake/detect-install-dirs.cmake) +include(cmake/detect-coverage.cmake) include(cmake/detect-intrinsics.cmake) +include(cmake/detect-sanitizer.cmake) include(cmake/fallback-macros.cmake) -set(ZLIB_SYMBOL_PREFIX "") - -if(BASEARCH_X86_FOUND) - set(WITH_AVX2 ON) - set(WITH_AVX512 ON) - set(WITH_AVX512VNNI ON) - set(WITH_SSE2 ON) - set(WITH_SSSE3 ON) - set(WITH_SSE42 ON) - set(WITH_PCLMULQDQ ON) - set(WITH_VPCLMULQDQ ON) +if(CMAKE_TOOLCHAIN_FILE) + message(STATUS "Using CMake toolchain: ${CMAKE_TOOLCHAIN_FILE}") endif() + +# Make sure we use an appropriate BUILD_TYPE by default, "Release" to be exact +# this should select the maximum generic optimisation on the current platform (i.e. -O3 for gcc/clang) +get_property(GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if(NOT GENERATOR_IS_MULTI_CONFIG) + if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, standard options are: Debug Release RelWithDebInfo MinSizeRel." 
+ FORCE) + add_feature_info(CMAKE_BUILD_TYPE 1 "Build type: ${CMAKE_BUILD_TYPE} (default)") + else() + add_feature_info(CMAKE_BUILD_TYPE 1 "Build type: ${CMAKE_BUILD_TYPE} (selected)") + endif() +endif() + +# +# Options parsing +# +option(WITH_GZFILEOP "Compile with support for gzFile related functions" ON) +option(ZLIB_COMPAT "Compile with zlib compatible API" ON) +option(ZLIB_ENABLE_TESTS "Build test binaries" OFF) +option(ZLIBNG_ENABLE_TESTS "Test zlib-ng specific API" OFF) +option(WITH_GTEST "Build gtest_zlib" OFF) +option(WITH_FUZZERS "Build test/fuzz" OFF) +option(WITH_BENCHMARKS "Build test/benchmarks" OFF) +option(WITH_BENCHMARK_APPS "Build application benchmarks" OFF) +option(WITH_OPTIM "Build with optimisation" ON) +option(WITH_REDUCED_MEM "Reduced memory usage for special cases (reduces performance)" OFF) +option(WITH_NEW_STRATEGIES "Use new strategies" ON) +option(WITH_NATIVE_INSTRUCTIONS + "Instruct the compiler to use the full instruction set on this host (gcc/clang -march=native)" OFF) +option(WITH_RUNTIME_CPU_DETECTION "Build with runtime detection of CPU architecture" ON) +option(WITH_MAINTAINER_WARNINGS "Build with project maintainer warnings" OFF) +option(WITH_CODE_COVERAGE "Enable code coverage reporting" OFF) +option(WITH_INFLATE_STRICT "Build with strict inflate distance checking" OFF) +option(WITH_INFLATE_ALLOW_INVALID_DIST "Build with zero fill for inflate invalid distances" OFF) +option(WITH_UNALIGNED "Support unaligned reads on platforms that support it" ON) + +set(ZLIB_SYMBOL_PREFIX "" CACHE STRING "Give this prefix to all publicly exported symbols. +Useful when embedding into a larger library. +Default is no prefix (empty prefix).") + +# Add multi-choice option +set(WITH_SANITIZER AUTO CACHE STRING "Enable sanitizer support") +set_property(CACHE WITH_SANITIZER PROPERTY STRINGS "Memory" "Address" "Undefined" "Thread") + if(BASEARCH_ARM_FOUND) - set(WITH_ACLE ON) - set(WITH_NEON ON) - if(ARM) - set(WITH_ARMV6 ON) - else() - set(WITH_ARMV6 OFF) - endif() -endif() -if(BASEARCH_PPC_FOUND) - set(WITH_ALTIVEC ON) - set(WITH_POWER8 ON) - set(WITH_POWER9 ON) -endif() -if(BASEARCH_RISCV_FOUND) - set(WITH_RVV ON) + option(WITH_ACLE "Build with ACLE" ON) + option(WITH_NEON "Build with NEON intrinsics" ON) + cmake_dependent_option(WITH_ARMV6 "Build with ARMv6 SIMD" ON "NOT ARCH STREQUAL \"aarch64\"" OFF) +elseif(BASEARCH_PPC_FOUND) + option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON) + option(WITH_POWER8 "Build with optimisations for POWER8" ON) + option(WITH_POWER9 "Build with optimisations for POWER9" ON) +elseif(BASEARCH_RISCV_FOUND) + option(WITH_RVV "Build with RVV intrinsics" ON) +elseif(BASEARCH_S360_FOUND) + option(WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z" OFF) + option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF) + option(WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z" ON) +elseif(BASEARCH_X86_FOUND) + option(WITH_SSE2 "Build with SSE2" ON) + cmake_dependent_option(WITH_SSSE3 "Build with SSSE3" ON "WITH_SSE2" OFF) + cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSSE3" OFF) + cmake_dependent_option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON "WITH_SSE42" OFF) + cmake_dependent_option(WITH_AVX2 "Build with AVX2" ON "WITH_SSE42" OFF) + cmake_dependent_option(WITH_AVX512 "Build with AVX512" ON "WITH_AVX2" OFF) + cmake_dependent_option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON "WITH_AVX512" OFF) + cmake_dependent_option(WITH_VPCLMULQDQ "Build 
with VPCLMULQDQ" ON "WITH_PCLMULQDQ;WITH_AVX512" OFF) endif() +option(INSTALL_UTILS "Copy minigzip and minideflate during install" OFF) -add_definitions(-DZLIB_COMPAT) +set(ZLIB_BUILD_SHARED_LIBS OFF) +set(SKIP_INSTALL_ALL ON) +ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes -Wmissing-declarations -Wundef -Wstrict-prototypes -Wtype-limits) +ocv_warnings_disable(CMAKE_C_FLAGS /wd4819 /wd4244 /wd4334) -add_definitions(-DWITH_GZFILEOP) +mark_as_advanced(FORCE + ZLIB_SYMBOL_PREFIX + WITH_REDUCED_MEM + WITH_ACLE WITH_NEON + WITH_ARMV6 + WITH_DFLTCC_DEFLATE + WITH_DFLTCC_INFLATE + WITH_CRC32_VX + WITH_AVX2 WITH_SSE2 + WITH_SSSE3 WITH_SSE42 + WITH_PCLMULQDQ + WITH_ALTIVEC + WITH_POWER8 + WITH_POWER9 + WITH_RVV + WITH_INFLATE_STRICT + WITH_INFLATE_ALLOW_INVALID_DIST + WITH_UNALIGNED + INSTALL_UTILS + ) + +if(ZLIB_COMPAT) + add_definitions(-DZLIB_COMPAT) + set(WITH_GZFILEOP ON) + set(SUFFIX "") + set(ZLIB_FULL_VERSION ${ZLIB_HEADER_VERSION}.zlib-ng) + set(EXPORT_NAME ZLIB) +else() + set(SUFFIX "-ng") + set(ZLIB_FULL_VERSION ${ZLIBNG_HEADER_VERSION}) + set(EXPORT_NAME zlib-ng) +endif() + +if(WITH_GZFILEOP) + add_definitions(-DWITH_GZFILEOP) +endif() if(CMAKE_C_COMPILER_ID MATCHES "^Intel") - set(WARNFLAGS_DISABLE) + if(CMAKE_HOST_UNIX) + set(WARNFLAGS -Wall) + set(WARNFLAGS_MAINTAINER -Wall -Wcheck -Wremarks) + set(WARNFLAGS_DISABLE) + else() + set(WARNFLAGS /Wall) + set(WARNFLAGS_MAINTAINER /W5) + set(WARNFLAGS_DISABLE) + endif() + check_c_compiler_flag(-diag-disable=10441 HAVE_DIAG_10441) + if(HAVE_DIAG_10441) + list(APPEND WARNFLAGS_DISABLE "-diag-disable=10441") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -diag-disable=10441") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -diag-disable=10441") + endif() elseif(MSVC) - # Minimum supported MSVC version is 1800 = Visual Studio 12.0/2013 - # See also https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html - if(MSVC_VERSION VERSION_LESS 1800) - message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).") - endif() - # TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination - # (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should - # avoid mistakes. - # /Oi ? 
- set(WARNFLAGS_DISABLE) - if(BASEARCH_ARM_FOUND) - add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE) - if(NOT "${ARCH}" MATCHES "aarch64") - set(NEONFLAG "/arch:VFPv4") - endif() - endif() -elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - set(WARNFLAGS_DISABLE) - # Check whether -fno-lto is available - set(CMAKE_REQUIRED_FLAGS "-fno-lto") - check_c_source_compiles( - "int main() { return 0; }" - FNO_LTO_AVAILABLE FAIL_REGEX "not supported") - set(CMAKE_REQUIRED_FLAGS) - if(FNO_LTO_AVAILABLE) - set(ZNOLTOFLAG "-fno-lto") - endif() - if(BASEARCH_ARM_FOUND) - if(ARM AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi") - # Auto-detect support for ARM floating point ABI - check_include_file(features.h HAVE_FEATURES_H) - if(HAVE_FEATURES_H) - set(CMAKE_REQUIRED_FLAGS -mfloat-abi=softfp) - check_c_source_compiles( - "#include - int main() { return 0; }" - HAVE_FLOATABI_SOFTFP) - if(HAVE_FLOATABI_SOFTFP) - set(FLOATABI -mfloat-abi=softfp) - else() - set(CMAKE_REQUIRED_FLAGS -mfloat-abi=hard) - check_c_source_compiles( - "#include - int main() { return 0; }" - HAVE_FLOATABI_HARD) - if(HAVE_FLOATABI_HARD) - set(FLOATABI -mfloat-abi=hard) - endif() + # Minimum supported MSVC version is 1800 = Visual Studio 12.0/2013 + # See also https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html + if(MSVC_VERSION VERSION_LESS 1800) + message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).") + endif() + # TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination + # (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should + # avoid mistakes. + # /Oi ? + set(WARNFLAGS /W3) + set(WARNFLAGS_MAINTAINER /W4) + set(WARNFLAGS_DISABLE) + if(BASEARCH_ARM_FOUND) + add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE) + if(NOT "${ARCH}" MATCHES "aarch64") + set(NEONFLAG "/arch:VFPv4") endif() - set(CMAKE_REQUIRED_FLAGS) - endif() - if(FLOATABI) - message(STATUS "${ZLIB_LIBRARY} ARM floating point arch: ${FLOATABI}") - add_compile_options(${FLOATABI}) - else() - message(STATUS "${ZLIB_LIBRARY} ARM floating point arch not auto-detected") - endif() endif() - endif() - if(FNO_LTO_AVAILABLE) - set(NOLTOFLAG ${ZNOLTOFLAG}) - endif() - if(MINGW) - # Add `-Wno-pedantic-ms-format` only if the toolchain supports it - check_c_compiler_flag(-Wno-pedantic-ms-format HAVE_NO_PEDANTIC_MS_FORMAT) - if(HAVE_NO_PEDANTIC_MS_FORMAT) - list(APPEND WARNFLAGS_DISABLE -Wno-pedantic-ms-format) +elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + # Enable warnings in GCC and Clang + set(WARNFLAGS -Wall) + set(WARNFLAGS_MAINTAINER -Wextra) + set(WARNFLAGS_DISABLE) + # Check whether -fno-lto is available + set(CMAKE_REQUIRED_FLAGS "-fno-lto") + check_c_source_compiles( + "int main() { return 0; }" + FNO_LTO_AVAILABLE FAIL_REGEX "not supported") + set(CMAKE_REQUIRED_FLAGS) + if(FNO_LTO_AVAILABLE) + set(ZNOLTOFLAG "-fno-lto") + endif() + if(NOT WITH_NATIVE_INSTRUCTIONS) + if(BASEARCH_ARM_FOUND) + if("${ARCH}" MATCHES "arm" AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi") + # Auto-detect support for ARM floating point ABI + check_include_file(features.h HAVE_FEATURES_H) + if(HAVE_FEATURES_H) + set(CMAKE_REQUIRED_FLAGS -mfloat-abi=softfp) + check_c_source_compiles( + "#include + int main() { return 0; }" + HAVE_FLOATABI_SOFTFP) + if(HAVE_FLOATABI_SOFTFP) + set(FLOATABI -mfloat-abi=softfp) + else() + set(CMAKE_REQUIRED_FLAGS -mfloat-abi=hard) + check_c_source_compiles( + "#include + int 
main() { return 0; }" + HAVE_FLOATABI_HARD) + if(HAVE_FLOATABI_HARD) + set(FLOATABI -mfloat-abi=hard) + endif() + endif() + set(CMAKE_REQUIRED_FLAGS) + endif() + if(FLOATABI) + message(STATUS "ARM floating point arch: ${FLOATABI}") + add_compile_options(${FLOATABI}) + else() + message(STATUS "ARM floating point arch not auto-detected") + endif() + endif() + endif() + # Disable LTO unless Native Instructions are enabled + if(FNO_LTO_AVAILABLE) + set(NOLTOFLAG ${ZNOLTOFLAG}) + endif() + endif() + if(MINGW) + # Add `-Wno-pedantic-ms-format` only if the toolchain supports it + check_c_compiler_flag(-Wno-pedantic-ms-format HAVE_NO_PEDANTIC_MS_FORMAT) + if(HAVE_NO_PEDANTIC_MS_FORMAT) + list(APPEND WARNFLAGS_DISABLE -Wno-pedantic-ms-format) + endif() endif() - endif() endif() -# Force disable LTO -set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) +# Set native march/mcpu +if(WITH_NATIVE_INSTRUCTIONS) + if(NATIVE_ARCH_OVERRIDE) + message(STATUS "WARNING: WITH_NATIVE_INSTRUCTIONS enabled, but running with NATIVE_ARCH_OVERRIDE: ${NATIVE_ARCH_OVERRIDE}") + set(NATIVEFLAG "${NATIVE_ARCH_OVERRIDE}") + else() + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + check_c_compiler_flag(-march=native HAVE_MARCH_NATIVE) + if(HAVE_MARCH_NATIVE) + set(NATIVEFLAG "-march=native") + else() + check_c_compiler_flag(-mcpu=native HAVE_MCPU_NATIVE) + if(HAVE_MCPU_NATIVE) + set(NATIVEFLAG "-mcpu=native") + endif() + endif() + # Fall through + endif() + endif() + if(NATIVEFLAG) + # Apply flags to all source files and compilation checks + if(WIN32) + separate_arguments(NATIVEOPTIONS WINDOWS_COMMAND "${NATIVEFLAG}") + else() + separate_arguments(NATIVEOPTIONS UNIX_COMMAND "${NATIVEFLAG}") + endif() + add_compile_options(${NATIVEOPTIONS}) + set(WITH_RUNTIME_CPU_DETECTION OFF) + else() + message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not implemented yet on this configuration") + set(WITH_NATIVE_INSTRUCTIONS OFF) + endif() +endif() + +# Compile without functable or CPU detection +if(NOT WITH_RUNTIME_CPU_DETECTION) + if(MSVC AND BASEARCH_X86_FOUND) + message(STATUS "WARNING: Microsoft Visual Studio does not support compile time detection of CPU features for \"/arch\" before \"AVX\"") + # Workaround for MSVC. By default MSVC does not define the __SSE*__ macros. + # Fix it if AVX is enabled. 
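+        # The probe below simply fails to compile unless the compiler already defines
+        # __AVX__, i.e. unless an /arch:AVX (or higher) option is in effect.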
+ set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") + check_c_source_compiles( + "#ifndef __AVX__ + # error \"AVX is not enabled.\" + #endif + int main(void) { return 0; }" + MSVC_IS_ENABLED_AVX + ) + set(CMAKE_REQUIRED_FLAGS) + if(MSVC_IS_ENABLED_AVX) + add_definitions( + -D__SSE__=1 + -D__SSE2__=1 + -D__SSE3__=1 + -D__SSSE3__=1 + -D__SSE4_1__=1 + -D__SSE4_2__=1 + -D__PCLMUL__=1 + ) + endif() + endif() + add_definitions(-DDISABLE_RUNTIME_CPU_DETECTION) +endif() + +# Force disable LTO if WITH_NATIVE_INSTRUCTIONS is not active +if(NOT WITH_NATIVE_INSTRUCTIONS) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) + foreach(_cfg_name IN LISTS CMAKE_CONFIGURATION_TYPES) + string(TOUPPER "${_cfg_name}" _cfg_name_uc) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_${_cfg_name_uc} OFF) + endforeach() +endif() + +# Set architecture alignment requirements +if(NOT WITH_UNALIGNED) + add_definitions(-DNO_UNALIGNED) + message(STATUS "Unaligned reads manually disabled") +endif() # Apply warning compiler flags -add_compile_options(${WARNFLAGS_DISABLE}) +if(WITH_MAINTAINER_WARNINGS) + add_compile_options(${WARNFLAGS} ${WARNFLAGS_MAINTAINER} ${WARNFLAGS_DISABLE}) +else() + add_compile_options(${WARNFLAGS} ${WARNFLAGS_DISABLE}) +endif() + +# Set code coverage compiler flags +if(WITH_CODE_COVERAGE) + add_code_coverage() +endif() # Replace optimization level 3 added by default with level 2 -if(NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3") - string(REGEX REPLACE "([\\/\\-]O)3" "\\12" - CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") +if(NOT WITH_CODE_COVERAGE AND NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3") + string(REGEX REPLACE "([\\/\\-]O)3" "\\12" + CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") endif() # @@ -159,32 +368,40 @@ endif() # check_include_file(arm_acle.h HAVE_ARM_ACLE_H) if(HAVE_ARM_ACLE_H) - add_definitions(-DHAVE_ARM_ACLE_H) + add_definitions(-DHAVE_ARM_ACLE_H) endif() check_include_file(sys/auxv.h HAVE_SYS_AUXV_H) if(HAVE_SYS_AUXV_H) - add_definitions(-DHAVE_SYS_AUXV_H) + add_definitions(-DHAVE_SYS_AUXV_H) endif() check_include_file(sys/sdt.h HAVE_SYS_SDT_H) if(HAVE_SYS_SDT_H) - add_definitions(-DHAVE_SYS_SDT_H) + add_definitions(-DHAVE_SYS_SDT_H) endif() check_include_file(unistd.h HAVE_UNISTD_H) +# +# Check for Linux includes +# +check_include_file(linux/auxvec.h HAVE_LINUX_AUXVEC_H) +if(HAVE_LINUX_AUXVEC_H) + add_definitions(-DHAVE_LINUX_AUXVEC_H) +endif() + # # Check to see if we have large file support # set(CMAKE_REQUIRED_DEFINITIONS -D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) check_type_size(off64_t OFF64_T) if(HAVE_OFF64_T) - add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) -else() - check_type_size(_off64_t _OFF64_T) - if(HAVE__OFF64_T) add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) - else() - check_type_size(__off64_t __OFF64_T) - endif() +else() + check_type_size(_off64_t _OFF64_T) + if(HAVE__OFF64_T) + add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) + else() + check_type_size(__off64_t __OFF64_T) + endif() endif() set(CMAKE_REQUIRED_DEFINITIONS) # clear variable @@ -193,499 +410,676 @@ set(CMAKE_REQUIRED_DEFINITIONS) # clear variable # check_function_exists(fseeko HAVE_FSEEKO) if(NOT HAVE_FSEEKO) - add_definitions(-DNO_FSEEKO) + add_definitions(-DNO_FSEEKO) endif() check_function_exists(strerror HAVE_STRERROR) if(NOT HAVE_STRERROR) - add_definitions(-DNO_STRERROR) + add_definitions(-DNO_STRERROR) endif() set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112L) check_symbol_exists(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN) 
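# posix_memalign is only declared when _POSIX_C_SOURCE is at least 200112L, hence the
# temporary CMAKE_REQUIRED_DEFINITIONS above; the aligned_alloc probe below repeats the
# same pattern with _ISOC11_SOURCE.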
if(HAVE_POSIX_MEMALIGN) - add_definitions(-DHAVE_POSIX_MEMALIGN) + add_definitions(-DHAVE_POSIX_MEMALIGN) endif() set(CMAKE_REQUIRED_DEFINITIONS) set(CMAKE_REQUIRED_DEFINITIONS -D_ISOC11_SOURCE=1) check_symbol_exists(aligned_alloc stdlib.h HAVE_ALIGNED_ALLOC) if(HAVE_ALIGNED_ALLOC) - add_definitions(-DHAVE_ALIGNED_ALLOC) + add_definitions(-DHAVE_ALIGNED_ALLOC) endif() set(CMAKE_REQUIRED_DEFINITIONS) +if(WITH_SANITIZER STREQUAL "Address") + add_address_sanitizer() +elseif(WITH_SANITIZER STREQUAL "Memory") + add_memory_sanitizer() +elseif(WITH_SANITIZER STREQUAL "Thread") + add_thread_sanitizer() +elseif(WITH_SANITIZER STREQUAL "Undefined") + add_undefined_sanitizer() +endif() + +# +# Check whether compiler supports -fno-semantic-interposition parameter +# +check_c_compiler_flag(-fno-semantic-interposition HAVE_NO_INTERPOSITION) + # # Check if we can hide zlib internal symbols that are linked between separate source files using hidden # check_c_source_compiles( - "#define Z_INTERNAL __attribute__((visibility (\"hidden\"))) - int Z_INTERNAL foo; - int main() { - return 0; - }" - HAVE_ATTRIBUTE_VISIBILITY_HIDDEN FAIL_REGEX "visibility") + "#define Z_INTERNAL __attribute__((visibility (\"hidden\"))) + int Z_INTERNAL foo; + int main() { + return 0; + }" + HAVE_ATTRIBUTE_VISIBILITY_HIDDEN FAIL_REGEX "visibility") if(HAVE_ATTRIBUTE_VISIBILITY_HIDDEN) - add_definitions(-DHAVE_VISIBILITY_HIDDEN) + add_definitions(-DHAVE_VISIBILITY_HIDDEN) endif() # # Check if we can hide zlib internal symbols that are linked between separate source files using internal # check_c_source_compiles( - "#define Z_INTERNAL __attribute__((visibility (\"internal\"))) - int Z_INTERNAL foo; - int main() { - return 0; - }" - HAVE_ATTRIBUTE_VISIBILITY_INTERNAL FAIL_REGEX "visibility") + "#define Z_INTERNAL __attribute__((visibility (\"internal\"))) + int Z_INTERNAL foo; + int main() { + return 0; + }" + HAVE_ATTRIBUTE_VISIBILITY_INTERNAL FAIL_REGEX "visibility") if(HAVE_ATTRIBUTE_VISIBILITY_INTERNAL) - add_definitions(-DHAVE_VISIBILITY_INTERNAL) + add_definitions(-DHAVE_VISIBILITY_INTERNAL) endif() # # Check for __attribute__((aligned(x))) support in the compiler # check_c_source_compiles( - "int main(void) { - __attribute__((aligned(8))) int test = 0; - (void)test; - return 0; - }" - HAVE_ATTRIBUTE_ALIGNED FAIL_REGEX "aligned") + "int main(void) { + __attribute__((aligned(8))) int test = 0; + (void)test; + return 0; + }" + HAVE_ATTRIBUTE_ALIGNED FAIL_REGEX "aligned") if(HAVE_ATTRIBUTE_ALIGNED) - add_definitions(-DHAVE_ATTRIBUTE_ALIGNED) + add_definitions(-DHAVE_ATTRIBUTE_ALIGNED) +endif() + +# +# Check for __builtin_assume_aligned(x,n) support in the compiler +# +check_c_source_compiles( + "char *test(char *buffer) { + char *abuffer = __builtin_assume_aligned(buffer,64); + return abuffer; + } + int main() { + return 0; + }" + HAVE_BUILTIN_ASSUME_ALIGNED) +if(HAVE_BUILTIN_ASSUME_ALIGNED) + add_definitions(-DHAVE_BUILTIN_ASSUME_ALIGNED) endif() # # check for __builtin_ctz() support in the compiler # check_c_source_compiles( - "int main(void) { - unsigned int zero = 0; - long test = __builtin_ctz(zero); - (void)test; - return 0; - }" - HAVE_BUILTIN_CTZ + "int main(void) { + unsigned int zero = 0; + long test = __builtin_ctz(zero); + (void)test; + return 0; + }" + HAVE_BUILTIN_CTZ ) if(HAVE_BUILTIN_CTZ) - add_definitions(-DHAVE_BUILTIN_CTZ) + add_definitions(-DHAVE_BUILTIN_CTZ) endif() # # check for __builtin_ctzll() support in the compiler # check_c_source_compiles( - "int main(void) { - unsigned int zero = 0; - long test = 
__builtin_ctzll(zero); - (void)test; - return 0; - }" - HAVE_BUILTIN_CTZLL + "int main(void) { + unsigned int zero = 0; + long test = __builtin_ctzll(zero); + (void)test; + return 0; + }" + HAVE_BUILTIN_CTZLL ) if(HAVE_BUILTIN_CTZLL) - add_definitions(-DHAVE_BUILTIN_CTZLL) + add_definitions(-DHAVE_BUILTIN_CTZLL) endif() # # check for ptrdiff_t support # check_c_source_compiles( - "#include - int main() { - ptrdiff_t *a; - (void)a; - return 0; - }" - HAVE_PTRDIFF_T + "#include + int main() { + ptrdiff_t *a; + (void)a; + return 0; + }" + HAVE_PTRDIFF_T ) if(NOT HAVE_PTRDIFF_T) - set(NEED_PTRDIFF_T 1) + set(NEED_PTRDIFF_T 1) - check_type_size("void *" SIZEOF_DATA_PTR) - message(STATUS "sizeof(void *) is ${SIZEOF_DATA_PTR} bytes") + check_type_size("void *" SIZEOF_DATA_PTR) + message(STATUS "sizeof(void *) is ${SIZEOF_DATA_PTR} bytes") - if(${SIZEOF_DATA_PTR} MATCHES "4") - set(PTRDIFF_TYPE "uint32_t") - elseif(${SIZEOF_DATA_PTR} MATCHES "8") - set(PTRDIFF_TYPE "uint64_t") - else() - message(FATAL_ERROR "sizeof(void *) is neither 32 nor 64 bit") - endif() + if(${SIZEOF_DATA_PTR} MATCHES "4") + set(PTRDIFF_TYPE "uint32_t") + elseif(${SIZEOF_DATA_PTR} MATCHES "8") + set(PTRDIFF_TYPE "uint64_t") + else() + message(FATAL_ERROR "sizeof(void *) is neither 32 nor 64 bit") + endif() endif() +add_compile_options($<$:-DZLIB_DEBUG>) + if(MSVC) - add_definitions(-D_CRT_SECURE_NO_DEPRECATE) - add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) + set(CMAKE_DEBUG_POSTFIX "d") + add_definitions(-D_CRT_SECURE_NO_DEPRECATE) + add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) endif() +if(BASEARCH_X86_FOUND) + # FORCE_SSE2 option will only be shown if HAVE_SSE2_INTRIN is true + if("${ARCH}" MATCHES "i[3-6]86") + cmake_dependent_option(FORCE_SSE2 "Always assume CPU is SSE2 capable" OFF "HAVE_SSE2_INTRIN" OFF) + endif() +endif() + +# +# Enable deflate_quick at level 1 +# +if(NOT WITH_NEW_STRATEGIES) + add_definitions(-DNO_QUICK_STRATEGY) +endif() +# +# Enable deflate_medium at level 4-6 +# +if(NOT WITH_NEW_STRATEGIES) + add_definitions(-DNO_MEDIUM_STRATEGY) +endif() +# +# Enable inflate compilation options +# +if(WITH_INFLATE_STRICT) + add_definitions(-DINFLATE_STRICT) + message(STATUS "Inflate strict distance checking enabled") +endif() +if(WITH_INFLATE_ALLOW_INVALID_DIST) + add_definitions(-DINFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR) + message(STATUS "Inflate zero data for invalid distances enabled") +endif() +# +# Enable reduced memory configuration +# +if(WITH_REDUCED_MEM) + add_definitions(-DHASH_SIZE=32768u -DGZBUFSIZE=8192 -DNO_LIT_MEM) + message(STATUS "Configured for reduced memory environment") +endif() + +set(GENERIC_ARCHDIR "arch/generic") + set(ZLIB_ARCH_SRCS) -set(ZLIB_ARCH_HDRS) -set(ARCHDIR "arch/generic") -if(BASEARCH_X86_FOUND) - set(ARCHDIR "arch/x86") -endif() +set(ZLIB_ARCH_HDRS ${GENERIC_ARCHDIR}/generic_functions.h) + if(BASEARCH_ARM_FOUND) - set(ARCHDIR "arch/arm") -endif() -if(BASEARCH_PPC_FOUND) - set(ARCHDIR "arch/power") -endif() -if(BASEARCH_RISCV_FOUND) - set(ARCHDIR "arch/riscv") + set(ARCHDIR "arch/arm") +elseif(BASEARCH_PPC_FOUND) + set(ARCHDIR "arch/power") +elseif(BASEARCH_RISCV_FOUND) + set(ARCHDIR "arch/riscv") +elseif(BASEARCH_S360_FOUND) + set(ARCHDIR "arch/s390") +elseif(BASEARCH_X86_FOUND) + set(ARCHDIR "arch/x86") + if(NOT ${ARCH} MATCHES "x86_64") + add_feature_info(SSE2 1 "Support the SSE2 instruction set, using \"${SSE2FLAG}\"") + endif() +else() + set(ARCHDIR ${GENERIC_ARCHDIR}) + message(STATUS "No optimized architecture: using ${ARCHDIR}") endif() -if(NOT 
CV_DISABLE_OPTIMIZATION) - if(BASEARCH_ARM_FOUND) - add_definitions(-DARM_FEATURES) - if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if("${ARCH}" MATCHES "aarch64") - check_c_source_compiles( - "#include - int main() { - return (getauxval(AT_HWCAP) & HWCAP_CRC32); - }" - ARM_AUXV_HAS_CRC32 - ) - if(ARM_AUXV_HAS_CRC32) - add_definitions(-DARM_AUXV_HAS_CRC32) - else() - message(STATUS "HWCAP_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") +if(WITH_OPTIM) + if(BASEARCH_ARM_FOUND) + add_definitions(-DARM_FEATURES) + if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + if("${ARCH}" MATCHES "aarch64") + check_c_source_compiles( + "#include + int main() { + return (getauxval(AT_HWCAP) & HWCAP_CRC32); + }" + ARM_AUXV_HAS_CRC32 + ) + if(ARM_AUXV_HAS_CRC32) + add_definitions(-DARM_AUXV_HAS_CRC32) + else() + message(STATUS "HWCAP_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") + endif() + else() + check_c_source_compiles( + "#include + int main() { + return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); + }" + ARM_AUXV_HAS_CRC32 + ) + if(ARM_AUXV_HAS_CRC32) + add_definitions(-DARM_AUXV_HAS_CRC32) + else() + check_c_source_compiles( + "#include + #include + int main() { + return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); + }" + ARM_HWCAP_HAS_CRC32 + ) + if (ARM_HWCAP_HAS_CRC32) + add_definitions(-DARM_AUXV_HAS_CRC32 -DARM_ASM_HWCAP) + else() + message(STATUS "HWCAP2_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") + endif() + endif() + check_c_source_compiles( + "#include + int main() { + return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON); + }" + ARM_AUXV_HAS_NEON + ) + if(ARM_AUXV_HAS_NEON) + add_definitions(-DARM_AUXV_HAS_NEON) + else() + check_c_source_compiles( + "#include + int main() { + return (getauxval(AT_HWCAP) & HWCAP_NEON); + }" + ARM_AUXV_HAS_NEON + ) + if (ARM_AUXV_HAS_NEON) + add_definitions(-DARM_AUXV_HAS_NEON) + else() + message(STATUS "Neither HWCAP_ARM_NEON or HWCAP_NEON present in sys/auxv.h; cannot detect support at runtime.") + endif() + endif() + endif() endif() - else() - check_c_source_compiles( - "#include - int main() { - return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); - }" - ARM_AUXV_HAS_CRC32 - ) - if(ARM_AUXV_HAS_CRC32) - add_definitions(-DARM_AUXV_HAS_CRC32) - else() - check_c_source_compiles( - "#include - #include - int main() { - return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); - }" - ARM_HWCAP_HAS_CRC32 - ) - if(ARM_HWCAP_HAS_CRC32) - add_definitions(-DARM_AUXV_HAS_CRC32 -DARM_ASM_HWCAP) - else() - message(STATUS "HWCAP2_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") - endif() + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/arm_features.c) endif() - check_c_source_compiles( - "#include - int main() { - return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON); - }" - ARM_AUXV_HAS_NEON - ) - if(ARM_AUXV_HAS_NEON) - add_definitions(-DARM_AUXV_HAS_NEON) + + if(WITH_ACLE) + check_acle_compiler_flag() + if(HAVE_ACLE_FLAG) + add_definitions(-DARM_ACLE) + set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c) + set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}") + list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS}) + add_feature_info(ACLE_CRC 1 "Support ACLE optimized CRC hash generation, using \"${ACLEFLAG}\"") + else() + set(WITH_ACLE OFF) + endif() else() - check_c_source_compiles( - "#include - int main() { - return (getauxval(AT_HWCAP) & HWCAP_NEON); - }" - ARM_AUXV_HAS_NEON - ) - if 
(ARM_AUXV_HAS_NEON) - add_definitions(-DARM_AUXV_HAS_NEON) - else() - message(STATUS "Neither HWCAP_ARM_NEON or HWCAP_NEON present in sys/auxv.h; cannot detect support at runtime.") - endif() + set(WITH_ACLE OFF) + endif() + if(WITH_NEON) + check_neon_compiler_flag() + if(NEON_AVAILABLE) + add_definitions(-DARM_NEON) + set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c + ${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c) + list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS}) + set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}") + if(MSVC) + add_definitions(-D__ARM_NEON__) + endif() + add_feature_info(NEON_ADLER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"") + add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"") + check_neon_ld4_intrinsics() + if(NEON_HAS_LD4) + add_definitions(-DARM_NEON_HASLD4) + endif() + else() + set(WITH_NEON OFF) + endif() + endif() + if(WITH_ARMV6) + check_armv6_compiler_flag() + if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN) + add_definitions(-DARM_SIMD) + set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c) + set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}") + list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS}) + add_feature_info(ARMV6 1 "Support ARMv6 SIMD instructions in slide_hash, using \"${ARMV6FLAG}\"") + if(HAVE_ARMV6_INTRIN) + add_definitions(-DARM_SIMD_INTRIN) + endif() + else() + set(WITH_ARMV6 OFF) + endif() + else() + set(WITH_ARMV6 OFF) + endif() + elseif(BASEARCH_PPC_FOUND) + # Common arch detection code + if(WITH_ALTIVEC) + check_ppc_intrinsics() + endif() + if(WITH_POWER8) + check_power8_intrinsics() + endif() + if(WITH_POWER9) + check_power9_intrinsics() + endif() + if(POWER8_NEED_AUXVEC_H OR POWER9_NEED_AUXVEC_H) + add_definitions(-DPOWER_NEED_AUXVEC_H) + endif() + if(HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN) + add_definitions(-DPOWER_FEATURES) + endif() + if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c) + endif() + endif() + # VMX specific options and files + if(WITH_ALTIVEC) + if(HAVE_VMX) + add_definitions(-DPPC_FEATURES) + if(HAVE_ALTIVEC) + add_definitions(-DPPC_VMX) + set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c) + list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS}) + add_feature_info(ALTIVEC 1 "Support the AltiVec instruction set, using \"-maltivec\"") + set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}") + else() + set(WITH_ALTIVEC OFF) + endif() + endif() + endif() + # Power8 specific options and files + if(WITH_POWER8) + if(HAVE_POWER8_INTRIN) + add_definitions(-DPOWER8_VSX) + set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c) + if("${ARCH}" MATCHES "powerpc64(le)?") + add_definitions(-DPOWER8_VSX_CRC32) + list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c) + endif() + list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS}) + set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}") + else() + set(WITH_POWER8 OFF) + endif() + endif() + # Power9 specific options and files + if(WITH_POWER9) + if(HAVE_POWER9_INTRIN) + add_definitions(-DPOWER9) + set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c) + list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS}) + set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS 
"${POWER9FLAG} ${NOLTOFLAG}") + else() + set(WITH_POWER9 OFF) + endif() + endif() + elseif(BASEARCH_RISCV_FOUND) + if(WITH_RVV) + check_rvv_intrinsics() + if(HAVE_RVV_INTRIN) + add_definitions(-DRISCV_FEATURES) + add_definitions(-DRISCV_RVV) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c) + endif() + # FIXME: we will not set compile flags for riscv_features.c when + # the kernels update hwcap or hwprobe for riscv + set(RVV_SRCS ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND RVV_SRCS ${ARCHDIR}/riscv_features.c) + endif() + list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS}) + set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}") + else() + set(WITH_RVV OFF) + endif() + endif() + elseif(BASEARCH_S360_FOUND) + check_s390_intrinsics() + if(HAVE_S390_INTRIN) + add_definitions(-DS390_FEATURES) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/s390_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/s390_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/s390_features.c) + endif() + endif() + if(WITH_DFLTCC_DEFLATE) + add_definitions(-DS390_DFLTCC_DEFLATE) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/dfltcc_deflate.c) + endif() + if(WITH_DFLTCC_INFLATE) + add_definitions(-DS390_DFLTCC_INFLATE) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/dfltcc_inflate.c) + endif() + if(WITH_CRC32_VX) + check_vgfma_intrinsics() + if(HAVE_VGFMA_INTRIN) + add_definitions(-DS390_CRC32_VX) + set(CRC32_VX_SRCS ${ARCHDIR}/crc32-vx.c) + list(APPEND ZLIB_ARCH_SRCS ${CRC32_VX_SRCS}) + set_property(SOURCE ${CRC32_VX_SRCS} PROPERTY COMPILE_FLAGS "${VGFMAFLAG} ${NOLTOFLAG}") + else() + set(WITH_CRC32_VX OFF) + endif() + endif() + elseif(BASEARCH_X86_FOUND) + add_definitions(-DX86_FEATURES) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c) endif() - endif() - endif() - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_features.h) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/arm_features.c) - if(WITH_ACLE) - check_acle_compiler_flag() - if(HAVE_ACLE_FLAG) - add_definitions(-DARM_ACLE) - set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c) - set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}") - list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS}) - else() - set(WITH_ACLE OFF) - endif() - else() - set(WITH_ACLE OFF) - endif() - if(WITH_NEON) - check_neon_compiler_flag() - if(NEON_AVAILABLE) - add_definitions(-DARM_NEON) - set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c - ${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c) - list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS}) - set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}") if(MSVC) - add_definitions(-D__ARM_NEON__) + list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) endif() - check_neon_ld4_intrinsics() - if(NEON_HAS_LD4) - add_definitions(-DARM_NEON_HASLD4) + check_xsave_intrinsics() + if(HAVE_XSAVE_INTRIN) + add_feature_info(XSAVE 1 "Support XSAVE intrinsics using \"${XSAVEFLAG}\"") + if(WITH_RUNTIME_CPU_DETECTION) + set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}") + endif() + if(NOT (CMAKE_C_COMPILER_ID MATCHES "GNU" AND 
CMAKE_C_COMPILER_VERSION VERSION_LESS 8.2)) + add_definitions(-DX86_HAVE_XSAVE_INTRIN) + endif() endif() - else() - set(WITH_NEON OFF) - endif() - endif() - if(WITH_ARMV6) - check_armv6_compiler_flag() - if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN) - add_definitions(-DARM_SIMD) - set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c) - set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}") - list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS}) - if(HAVE_ARMV6_INTRIN) - add_definitions(-DARM_SIMD_INTRIN) + if(WITH_SSE2) + check_sse2_intrinsics() + if(HAVE_SSE2_INTRIN) + add_definitions(-DX86_SSE2) + set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c) + list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS}) + if(NOT ${ARCH} MATCHES "x86_64") + set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}") + add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable") + if(FORCE_SSE2) + add_definitions(-DX86_NOCHECK_SSE2) + endif() + endif() + else() + set(WITH_SSE2 OFF) + endif() endif() - else() - set(WITH_ARMV6 OFF) - endif() - else() - set(WITH_ARMV6 OFF) - endif() - endif() - if(BASEARCH_PPC_FOUND) - # Common arch detection code - if(WITH_ALTIVEC) - check_ppc_intrinsics() - endif() - if(WITH_POWER8) - check_power8_intrinsics() - endif() - if(WITH_POWER9) - check_power9_intrinsics() - endif() - if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN) - add_definitions(-DPOWER_FEATURES) - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c) - endif() - # VMX specific options and files - if(WITH_ALTIVEC) - if(HAVE_VMX) - add_definitions(-DPPC_FEATURES) - if(HAVE_ALTIVEC) - add_definitions(-DPPC_VMX) - set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c) - list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS}) - set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}") - else() - set(WITH_ALTIVEC OFF) + if(WITH_SSSE3) + check_ssse3_intrinsics() + if(HAVE_SSSE3_INTRIN AND WITH_SSE2) + add_definitions(-DX86_SSSE3) + set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c) + add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS}) + set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}") + else() + set(WITH_SSSE3 OFF) + endif() endif() - endif() - endif() - # Power8 specific options and files - if(WITH_POWER8) - if(HAVE_POWER8_INTRIN) - add_definitions(-DPOWER8_VSX) - set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c) - if("${ARCH}" MATCHES "powerpc64(le)?") - add_definitions(-DPOWER8_VSX_CRC32) - list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c) + if(WITH_SSE42) + check_sse42_intrinsics() + if(HAVE_SSE42_INTRIN AND WITH_SSSE3) + add_definitions(-DX86_SSE42) + set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c) + add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized adler32 hash generation, using \"${SSE42FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS}) + set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}") + else() + set(WITH_SSE42 OFF) + endif() endif() - list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS}) - set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}") - else() - set(WITH_POWER8 OFF) - endif() - endif() - # Power9 specific options and files - if(WITH_POWER9) - if(HAVE_POWER9_INTRIN) - 
add_definitions(-DPOWER9) - set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c) - list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS}) - set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}") - else() - set(WITH_POWER9 OFF) - endif() - endif() - endif() - if(BASEARCH_RISCV_FOUND) - if(WITH_RVV) - check_rvv_intrinsics() - if(HAVE_RVV_INTRIN) - add_definitions(-DRISCV_FEATURES) - add_definitions(-DRISCV_RVV) - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c) - # FIXME: we will not set compile flags for riscv_features.c when - # the kernels update hwcap or hwprobe for riscv - set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c) - list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS}) - set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}") - else() - set(WITH_RVV OFF) - endif() - endif() - endif() - if(BASEARCH_X86_FOUND) - add_definitions(-DX86_FEATURES) - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c) - if(MSVC) - list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) - endif() - if(WITH_AVX2) - check_avx2_intrinsics() - if(HAVE_AVX2_INTRIN) - add_definitions(-DX86_AVX2) - set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c) - list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c) - list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c) - list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c) - list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS}) - set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}") - else() - set(WITH_AVX2 OFF) - endif() - endif() - if(WITH_AVX512) - check_avx512_intrinsics() - if(HAVE_AVX512_INTRIN) - add_definitions(-DX86_AVX512) - list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) - list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS}) - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h) - if(HAVE_MASK_INTRIN) - add_definitions(-DX86_MASK_INTRIN) + if(WITH_PCLMULQDQ) + check_pclmulqdq_intrinsics() + if(HAVE_PCLMULQDQ_INTRIN AND WITH_SSE42) + add_definitions(-DX86_PCLMULQDQ_CRC) + set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c) + add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSE42FLAG} ${PCLMULFLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS}) + set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}") + else() + set(WITH_PCLMULQDQ OFF) + endif() endif() - set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}") - else() - set(WITH_AVX512 OFF) - endif() - endif() - if(WITH_AVX512VNNI) - check_avx512vnni_intrinsics() - if(HAVE_AVX512VNNI_INTRIN) - add_definitions(-DX86_AVX512VNNI) - list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c) - list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS}) - set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}") - else() - set(WITH_AVX512VNNI OFF) - endif() - endif() - if(WITH_SSE42) - check_sse42_intrinsics() - if(HAVE_SSE42_INTRIN) - add_definitions(-DX86_SSE42) - set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c) - list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS}) - set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}") - else() - set(WITH_SSE42 OFF) - endif() - endif() - if(WITH_SSE2) - check_sse2_intrinsics() - if(HAVE_SSE2_INTRIN) - add_definitions(-DX86_SSE2) - set(SSE2_SRCS 
${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c) - list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS}) - if(NOT ${ARCH} MATCHES "x86_64") - set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}") - add_definitions(-DX86_NOCHECK_SSE2) + if(WITH_AVX2) + check_avx2_intrinsics() + if(HAVE_AVX2_INTRIN AND WITH_SSE42) + add_definitions(-DX86_AVX2) + set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c) + add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"") + list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c) + add_feature_info(AVX2_CHUNKSET 1 "Support AVX2 optimized chunkset, using \"${AVX2FLAG}\"") + list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c) + add_feature_info(AVX2_COMPARE256 1 "Support AVX2 optimized compare256, using \"${AVX2FLAG}\"") + list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c) + add_feature_info(AVX2_ADLER32 1 "Support AVX2-accelerated adler32, using \"${AVX2FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS}) + set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX2 OFF) + endif() endif() - else() - set(WITH_SSE2 OFF) - endif() - endif() - if(WITH_SSSE3) - check_ssse3_intrinsics() - if(HAVE_SSSE3_INTRIN) - add_definitions(-DX86_SSSE3) - set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c) - list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS}) - set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}") - else() - set(WITH_SSSE3 OFF) - endif() - endif() - if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE42) - check_pclmulqdq_intrinsics() - if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN) - add_definitions(-DX86_PCLMULQDQ_CRC) - set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c) - list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS}) - set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}") - - if(WITH_VPCLMULQDQ AND WITH_AVX512) - check_vpclmulqdq_intrinsics() - if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN) - add_definitions(-DX86_VPCLMULQDQ_CRC) - set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c) - list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS}) - set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}") - else() - set(WITH_VPCLMULQDQ OFF) - endif() - else() - set(WITH_VPCLMULQDQ OFF) + if(WITH_AVX512) + check_avx512_intrinsics() + if(HAVE_AVX512_INTRIN AND WITH_AVX2) + add_definitions(-DX86_AVX512) + list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) + add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS}) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h) + set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX512 OFF) + endif() + endif() + if(WITH_AVX512VNNI) + check_avx512vnni_intrinsics() + if(HAVE_AVX512VNNI_INTRIN AND WITH_AVX2) + add_definitions(-DX86_AVX512VNNI) + add_feature_info(AVX512VNNI_ADLER32 1 "Support AVX512VNNI adler32, using \"${AVX512VNNIFLAG}\"") + list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c) + list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS}) + set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX512VNNI OFF) + endif() + endif() + if(WITH_VPCLMULQDQ) + check_vpclmulqdq_intrinsics() + if(HAVE_VPCLMULQDQ_INTRIN AND 
WITH_PCLMULQDQ AND WITH_AVX512) + add_definitions(-DX86_VPCLMULQDQ_CRC) + set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c) + add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS}) + set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}") + else() + set(WITH_VPCLMULQDQ OFF) + endif() endif() - else() - set(WITH_PCLMULQDQ OFF) - set(WITH_VPCLMULQDQ OFF) - endif() - else() - set(WITH_PCLMULQDQ OFF) - set(WITH_VPCLMULQDQ OFF) endif() - check_xsave_intrinsics() - if(HAVE_XSAVE_INTRIN) - set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}") - endif() - endif() endif() +message(STATUS "Architecture-specific source files: ${ZLIB_ARCH_SRCS}") + #============================================================================ # zconf.h #============================================================================ macro(generate_cmakein input output) - file(REMOVE ${output}) - file(STRINGS ${input} _lines) - foreach(_line IN LISTS _lines) - string(REGEX REPLACE "#ifdef HAVE_UNISTD_H.*" "@ZCONF_UNISTD_LINE@" _line "${_line}") - string(REGEX REPLACE "#ifdef NEED_PTRDIFF_T.*" "@ZCONF_PTRDIFF_LINE@" _line "${_line}") - if(NEED_PTRDIFF_T) - string(REGEX REPLACE "typedef PTRDIFF_TYPE" "typedef @PTRDIFF_TYPE@" _line "${_line}") - endif() - file(APPEND ${output} "${_line}\n") - endforeach() + file(REMOVE ${output}) + file(STRINGS ${input} _lines) + foreach(_line IN LISTS _lines) + string(REGEX REPLACE "#ifdef HAVE_UNISTD_H.*" "@ZCONF_UNISTD_LINE@" _line "${_line}") + string(REGEX REPLACE "#ifdef NEED_PTRDIFF_T.*" "@ZCONF_PTRDIFF_LINE@" _line "${_line}") + if(NEED_PTRDIFF_T) + string(REGEX REPLACE "typedef PTRDIFF_TYPE" "typedef @PTRDIFF_TYPE@" _line "${_line}") + endif() + file(APPEND ${output} "${_line}\n") + endforeach() endmacro(generate_cmakein) -generate_cmakein( ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.in ${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein ) +generate_cmakein( ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.in ${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h.cmakein ) + +if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR) + # If we're doing an out of source build and the user has a zconf.h + # in their source tree... + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h) + message(STATUS "Renaming") + message(STATUS " ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h") + message(STATUS "to 'zconf${SUFFIX}.h.included' because this file is included with zlib") + message(STATUS "but CMake generates it automatically in the build directory.") + file(RENAME ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.included) + endif() + + # If we're doing an out of source build and the user has a zconf.h.cmakein + # in their source tree... + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.cmakein) + message(STATUS "Renaming") + message(STATUS " ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.cmakein") + message(STATUS "to 'zconf${SUFFIX}.h.cmakeincluded' because this file is included with zlib") + message(STATUS "but CMake generates it automatically in the build directory.") + file(RENAME ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.cmakein ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.cmakeincluded) + endif() +endif() + +# The user is allowed (but discouraged) to set absolute CMAKE_INSTALL_*DIR paths. 
+# If they do, we copy these non-relocatable paths into the pkg-config file. +if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}") + set(PC_INC_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") +else() + set(PC_INC_INSTALL_DIR "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") +endif() + +if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}") + set(PC_LIB_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}") +else() + set(PC_LIB_INSTALL_DIR "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}") +endif() #============================================================================ # zlib #============================================================================ set(ZLIB_PUBLIC_HDRS - ${CMAKE_CURRENT_BINARY_DIR}/zconf.h - ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling.h - ${CMAKE_CURRENT_BINARY_DIR}/zlib.h + ${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h + ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h + ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h ) set(ZLIB_PRIVATE_HDRS adler32_p.h chunkset_tpl.h compare256_rle.h - cpu_features.h + arch_functions.h crc32_braid_p.h crc32_braid_comb_p.h crc32_braid_tbl.h - crc32_fold.h deflate.h deflate_p.h functable.h @@ -704,15 +1098,17 @@ set(ZLIB_PRIVATE_HDRS zutil.h ) set(ZLIB_SRCS + arch/generic/adler32_c.c + arch/generic/adler32_fold_c.c + arch/generic/chunkset_c.c + arch/generic/compare256_c.c + arch/generic/crc32_braid_c.c + arch/generic/crc32_fold_c.c + arch/generic/slide_hash_c.c adler32.c - adler32_fold.c - chunkset.c - compare256.c compress.c - cpu_features.c - crc32_braid.c + crc32.c crc32_braid_comb.c - crc32_fold.c deflate.c deflate_fast.c deflate_huff.c @@ -727,12 +1123,16 @@ set(ZLIB_SRCS inftrees.c insert_string.c insert_string_roll.c - slide_hash.c trees.c uncompr.c zutil.c ) +if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_PRIVATE_HDRS cpu_features.h) + list(APPEND ZLIB_SRCS cpu_features.c) +endif() + set(ZLIB_GZFILE_PRIVATE_HDRS gzguts.h ) @@ -743,13 +1143,124 @@ set(ZLIB_GZFILE_SRCS ) set(ZLIB_ALL_SRCS ${ZLIB_SRCS} ${ZLIB_ARCH_HDRS} ${ZLIB_ARCH_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) -list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS}) +if(WITH_GZFILEOP) + list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS}) +endif() -add_library(zlib STATIC ${ZLIB_ALL_SRCS}) +if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) + set(ZLIB_DLL_SRCS win32/zlib${SUFFIX}1.rc) +endif() -target_include_directories(zlib PUBLIC - "$" - "$") +if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS) + add_library(zlib SHARED ${ZLIB_ALL_SRCS} ${ZLIB_DLL_SRCS}) + add_library(zlibstatic STATIC ${ZLIB_ALL_SRCS}) + + set(ZLIB_INSTALL_LIBRARIES zlib zlibstatic) +else() + + if(ZLIB_BUILD_SHARED_LIBS) + add_library(zlib SHARED ${ZLIB_ALL_SRCS} ${ZLIB_DLL_SRCS}) + target_sources(zlib PRIVATE ${ZLIB_DLL_SRCS}) + else() + add_library(zlib STATIC ${ZLIB_ALL_SRCS}) + add_library(zlibstatic ALIAS zlib) + endif() + + set(ZLIB_INSTALL_LIBRARIES zlib) +endif() + +# INFO: Mimics official zlib CMake target +# Generates ZLIB.cmake in case ZLIB_COMPAT=ON and always exports the CMake target ZLIB::ZLIB +# In case ZLIB_COMPAT=OFF, the CMake target and file follows zlib-ng naming convention +if (ZLIB_COMPAT) + if (TARGET zlib) + set_target_properties(zlib PROPERTIES EXPORT_NAME ZLIB) + else() + set_target_properties(zlibstatic PROPERTIES EXPORT_NAME ZLIB) + endif() +endif() + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${ARCHDIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/arch/generic) + +foreach(ZLIB_INSTALL_LIBRARY ${ZLIB_INSTALL_LIBRARIES}) + if(NOT 
ZLIB_COMPAT) + target_compile_definitions(${ZLIB_INSTALL_LIBRARY} PUBLIC ZLIBNG_NATIVE_API) + endif() + target_include_directories(${ZLIB_INSTALL_LIBRARY} PUBLIC + "$${CMAKE_CURRENT_SOURCE_DIR}>" + "$") +endforeach() + +if(WIN32) + # Shared library + if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) + set_target_properties(zlib PROPERTIES OUTPUT_NAME zlib${SUFFIX}) + endif() + # Static library + if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS) + if(MSVC) + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME zlibstatic${SUFFIX}) + else() + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME z${SUFFIX}) + endif() + elseif(NOT ZLIB_BUILD_SHARED_LIBS) + if(MSVC) + set_target_properties(zlib PROPERTIES OUTPUT_NAME zlibstatic${SUFFIX}) + else() + set_target_properties(zlib PROPERTIES OUTPUT_NAME z${SUFFIX}) + endif() + endif() +else() + # On unix-like platforms the library is almost always called libz + set_target_properties(${ZLIB_INSTALL_LIBRARIES} PROPERTIES OUTPUT_NAME z${SUFFIX}) +endif() + +if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) + set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) + + if(ZLIB_COMPAT) + set_target_properties(zlib PROPERTIES SOVERSION 1) + else() + set_target_properties(zlib PROPERTIES SOVERSION 2) + endif() + + if(NOT CYGWIN) + # This property causes shared libraries on Linux to have the full version + # encoded into their final filename. We disable this on Cygwin because + # it causes cygz-${ZLIB_FULL_VERSION}.dll to be created when cygz.dll + # seems to be the default. + # + # This has no effect with MSVC, on that platform the version info for + # the DLL comes from the resource file win32/zlib1.rc + set_target_properties(zlib PROPERTIES VERSION ${ZLIB_FULL_VERSION}) + endif() + + if(UNIX) + if(HAVE_NO_INTERPOSITION) + set_target_properties(zlib PROPERTIES COMPILE_FLAGS "-fno-semantic-interposition") + endif() + if(NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL AIX) + if(NOT ZLIB_COMPAT) + add_definitions(-DHAVE_SYMVER) + endif() + set_target_properties(zlib PROPERTIES LINK_FLAGS + "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib${SUFFIX}.map\"") + endif() + endif() + if(MSYS) + # Suppress version number from shared library name + set(CMAKE_SHARED_LIBRARY_NAME_WITH_VERSION 0) + elseif(WIN32) + # Creates zlib1.dll when building shared library version + if(ZLIB_COMPAT) + set_target_properties(zlib PROPERTIES SUFFIX "1.dll") + else() + set_target_properties(zlib PROPERTIES SUFFIX "2.dll") + endif() + endif() +endif() if(HAVE_UNISTD_H) SET(ZCONF_UNISTD_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") @@ -757,40 +1268,143 @@ else() SET(ZCONF_UNISTD_LINE "#if 0 /* was set to #if 0 by configure/cmake/etc */") endif() if(NEED_PTRDIFF_T) - SET(ZCONF_PTRDIFF_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") + SET(ZCONF_PTRDIFF_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") else() - SET(ZCONF_PTRDIFF_LINE "#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */") + SET(ZCONF_PTRDIFF_LINE "#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */") endif() -configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein - ${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib.h.in - ${CMAKE_CURRENT_BINARY_DIR}/zlib.h @ONLY) +set(ZLIB_PC ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.pc) +if(WITH_GZFILEOP) + set(PKG_CONFIG_CFLAGS "-DWITH_GZFILEOP") +endif() +configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h.cmakein + 
${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib${SUFFIX}.h.in + ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gzread.c.in - ${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY) + ${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling.h.empty - ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h COPYONLY) +if (NOT ZLIB_SYMBOL_PREFIX STREQUAL "") + add_feature_info(ZLIB_SYMBOL_PREFIX ON "Publicly exported symbols have a custom prefix") + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling${SUFFIX}.h.in + ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h @ONLY) +else() + add_feature_info(ZLIB_SYMBOL_PREFIX OFF "Publicly exported symbols DO NOT have a custom prefix") + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling.h.empty + ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h COPYONLY) +endif() +# add_definitions(-DZLIB_SYMBOL_PREFIX=${ZLIB_SYMBOL_PREFIX}) # not needed -ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes - -Wundef - -Wmissing-declarations -) -set_target_properties(${ZLIB_LIBRARY} PROPERTIES - OUTPUT_NAME ${ZLIB_LIBRARY} - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - COMPILE_PDB_NAME ${ZLIB_LIBRARY} - COMPILE_PDB_NAME_DEBUG "${ZLIB_LIBRARY}${OPENCV_DEBUG_POSTFIX}" - ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH} -) +if(NOT SKIP_INSTALL_LIBRARIES AND NOT SKIP_INSTALL_ALL) + install(TARGETS ${ZLIB_INSTALL_LIBRARIES} + EXPORT ${EXPORT_NAME} + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}") +endif() +if(NOT SKIP_INSTALL_HEADERS AND NOT SKIP_INSTALL_ALL) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" RENAME zlib${SUFFIX}.h) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" RENAME zlib_name_mangling${SUFFIX}.h) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" RENAME zconf${SUFFIX}.h) +endif() +if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL) + install(FILES ${ZLIB_PC} DESTINATION "${PKGCONFIG_INSTALL_DIR}") + install(EXPORT ${EXPORT_NAME} + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${EXPORT_NAME}" + NAMESPACE ${EXPORT_NAME}::) + # Use GNU-style variable names + set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}) + set(LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}) + if (ZLIB_COMPAT) + set(PACKAGE_CONFIGNAME zlib) + set(PACKAGE_VERSION ${ZLIB_HEADER_VERSION}) + else() + set(PACKAGE_CONFIGNAME zlib-ng) + set(PACKAGE_VERSION ${ZLIBNG_HEADER_VERSION}) + endif() +endif() + +#============================================================================ +# Example binaries +#============================================================================ + +if(ZLIB_ENABLE_TESTS) + enable_testing() + + if(ZLIB_BUILD_SHARED_LIBS) + if(ZLIBNG_ENABLE_TESTS) + message(STATUS "Disabling zlib-ng tests because shared libraries are enabled") + set(ZLIBNG_ENABLE_TESTS OFF) + endif() + + if(WITH_BENCHMARKS OR WITH_BENCHMARK_APPS) + message(STATUS "Disabling benchmarks because shared libraries are enabled") + set(WITH_BENCHMARKS OFF) + set(WITH_BENCHMARK_APPS OFF) + endif() + endif() + + add_subdirectory(test) +endif() + +add_feature_info(WITH_GZFILEOP WITH_GZFILEOP "Compile with support for gzFile related functions") +add_feature_info(ZLIB_COMPAT 
ZLIB_COMPAT "Compile with zlib compatible API") +add_feature_info(ZLIB_ENABLE_TESTS ZLIB_ENABLE_TESTS "Build test binaries") +add_feature_info(ZLIBNG_ENABLE_TESTS ZLIBNG_ENABLE_TESTS "Test zlib-ng specific API") +add_feature_info(WITH_SANITIZER WITH_SANITIZER "Enable sanitizer support") +add_feature_info(WITH_GTEST WITH_GTEST "Build gtest_zlib") +add_feature_info(WITH_FUZZERS WITH_FUZZERS "Build test/fuzz") +add_feature_info(WITH_BENCHMARKS WITH_BENCHMARKS "Build test/benchmarks") +add_feature_info(WITH_BENCHMARK_APPS WITH_BENCHMARK_APPS "Build application benchmarks") +add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation") +add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies") +add_feature_info(WITH_NATIVE_INSTRUCTIONS WITH_NATIVE_INSTRUCTIONS + "Instruct the compiler to use the full instruction set on this host (gcc/clang -march=native)") +add_feature_info(WITH_RUNTIME_CPU_DETECTION WITH_RUNTIME_CPU_DETECTION "Build with runtime CPU detection") +add_feature_info(WITH_MAINTAINER_WARNINGS WITH_MAINTAINER_WARNINGS "Build with project maintainer warnings") +add_feature_info(WITH_CODE_COVERAGE WITH_CODE_COVERAGE "Enable code coverage reporting") +add_feature_info(WITH_INFLATE_STRICT WITH_INFLATE_STRICT "Build with strict inflate distance checking") +add_feature_info(WITH_INFLATE_ALLOW_INVALID_DIST WITH_INFLATE_ALLOW_INVALID_DIST "Build with zero fill for inflate invalid distances") + +if(BASEARCH_ARM_FOUND) + add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE") + add_feature_info(WITH_NEON WITH_NEON "Build with NEON intrinsics") + add_feature_info(WITH_ARMV6 WITH_ARMV6 "Build with ARMv6 SIMD") +elseif(BASEARCH_PPC_FOUND) + add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations") + add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8") + add_feature_info(WITH_POWER9 WITH_POWER9 "Build with optimisations for POWER9") +elseif(BASEARCH_RISCV_FOUND) + add_feature_info(WITH_RVV WITH_RVV "Build with RVV intrinsics") +elseif(BASEARCH_S360_FOUND) + add_feature_info(WITH_DFLTCC_DEFLATE WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z") + add_feature_info(WITH_DFLTCC_INFLATE WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z") + add_feature_info(WITH_CRC32_VX WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z") +elseif(BASEARCH_X86_FOUND) + add_feature_info(WITH_AVX2 WITH_AVX2 "Build with AVX2") + add_feature_info(WITH_AVX512 WITH_AVX512 "Build with AVX512") + add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI") + add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2") + add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3") + add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42") + add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ") + add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ") +endif() + +add_feature_info(INSTALL_UTILS INSTALL_UTILS "Copy minigzip and minideflate during install") + +FEATURE_SUMMARY(WHAT ALL INCLUDE_QUIET_PACKAGES) if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${ZLIB_LIBRARY} PROPERTIES FOLDER "3rdparty") + set_target_properties(${ZLIB_INSTALL_LIBRARIES} PROPERTIES FOLDER "3rdparty") endif() if(NOT BUILD_SHARED_LIBS) - ocv_install_target(${ZLIB_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) + ocv_install_target(${ZLIB_INSTALL_LIBRARIES} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT 
dev) endif() -ocv_install_3rdparty_licenses(${ZLIB_LIBRARY} LICENSE.md) +ocv_install_3rdparty_licenses(${ZLIB_INSTALL_LIBRARIES} LICENSE.md) diff --git a/3rdparty/zlib-ng/LICENSE.md b/3rdparty/zlib-ng/LICENSE.md index adb48d4729..e866d7ac18 100644 --- a/3rdparty/zlib-ng/LICENSE.md +++ b/3rdparty/zlib-ng/LICENSE.md @@ -1,4 +1,4 @@ -(C) 1995-2013 Jean-loup Gailly and Mark Adler +(C) 1995-2024 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages diff --git a/3rdparty/zlib-ng/README.md b/3rdparty/zlib-ng/README.md index 4f9fe09c69..411621b52f 100644 --- a/3rdparty/zlib-ng/README.md +++ b/3rdparty/zlib-ng/README.md @@ -21,7 +21,6 @@ Features * Support for CPU intrinsics when available * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z - * Hash table implementation using CRC32-C intrinsics on x86 and ARM * Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX * Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX @@ -95,20 +94,21 @@ make test Build Options ------------- -| CMake | configure | Description | Default | -|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------| -| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF | -| ZLIB_ENABLE_TESTS | | Build test binaries | ON | -| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON | -| WITH_OPTIM | --without-optimizations | Build with optimisations | ON | -| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON | -| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF | -| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF | -| WITH_GTEST | | Build gtest_zlib | ON | -| WITH_FUZZERS | | Build test/fuzz | OFF | -| WITH_BENCHMARKS | | Build test/benchmarks | OFF | -| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF | -| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF | +| CMake | configure | Description | Default | +|:---------------------------|:-------------------------|:------------------------------------------------------------------------------------|---------| +| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF | +| ZLIB_ENABLE_TESTS | | Build test binaries | ON | +| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON | +| WITH_OPTIM | --without-optimizations | Build with optimisations | ON | +| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON | +| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF | +| WITH_RUNTIME_CPU_DETECTION | | Compiles with runtime CPU detection | ON | +| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF | +| WITH_GTEST | | Build gtest_zlib | ON | +| WITH_FUZZERS | | Build test/fuzz | OFF | +| WITH_BENCHMARKS | | Build test/benchmarks | OFF | +| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF | +| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF | Install diff --git a/3rdparty/zlib-ng/adler32.c 
b/3rdparty/zlib-ng/adler32.c index 95ac13c304..1a643ed53b 100644 --- a/3rdparty/zlib-ng/adler32.c +++ b/3rdparty/zlib-ng/adler32.c @@ -7,70 +7,24 @@ #include "functable.h" #include "adler32_p.h" -/* ========================================================================= */ -Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { - uint32_t sum2; - unsigned n; - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* in case user likes doing a byte at a time, keep it fast */ - if (UNLIKELY(len == 1)) - return adler32_len_1(adler, buf, sum2); - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (UNLIKELY(buf == NULL)) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (UNLIKELY(len < 16)) - return adler32_len_16(adler, buf, len, sum2); - - /* do length NMAX blocks -- requires just one modulo operation */ - while (len >= NMAX) { - len -= NMAX; -#ifdef UNROLL_MORE - n = NMAX / 16; /* NMAX is divisible by 16 */ -#else - n = NMAX / 8; /* NMAX is divisible by 8 */ -#endif - do { -#ifdef UNROLL_MORE - DO16(adler, sum2, buf); /* 16 sums unrolled */ - buf += 16; -#else - DO8(adler, sum2, buf, 0); /* 8 sums unrolled */ - buf += 8; -#endif - } while (--n); - adler %= BASE; - sum2 %= BASE; - } - - /* do remaining bytes (less than NMAX, still just one modulo) */ - return adler32_len_64(adler, buf, len, sum2); -} - #ifdef ZLIB_COMPAT unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) { - return (unsigned long)functable.adler32((uint32_t)adler, buf, len); + return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len); } #else uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) { - return functable.adler32(adler, buf, len); + return FUNCTABLE_CALL(adler32)(adler, buf, len); } #endif /* ========================================================================= */ #ifdef ZLIB_COMPAT unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) { - return (unsigned long)functable.adler32((uint32_t)adler, buf, len); + return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len); } #else uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) { - return functable.adler32(adler, buf, len); + return FUNCTABLE_CALL(adler32)(adler, buf, len); } #endif diff --git a/3rdparty/zlib-ng/adler32_fold.h b/3rdparty/zlib-ng/adler32_fold.h deleted file mode 100644 index 20aa1c7400..0000000000 --- a/3rdparty/zlib-ng/adler32_fold.h +++ /dev/null @@ -1,11 +0,0 @@ -/* adler32_fold.h -- adler32 folding interface - * Copyright (C) 2022 Adam Stylinski - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#ifndef ADLER32_FOLD_H_ -#define ADLER32_FOLD_H_ - -Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); - -#endif diff --git a/3rdparty/zlib-ng/arch/.gitignore b/3rdparty/zlib-ng/arch/.gitignore deleted file mode 100644 index 2c3af0a08c..0000000000 --- a/3rdparty/zlib-ng/arch/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# ignore Makefiles; they're all automatically generated -Makefile diff --git a/3rdparty/zlib-ng/arch/arm/Makefile.in b/3rdparty/zlib-ng/arch/arm/Makefile.in index 9d05b00b54..b6f0aaf211 100644 --- a/3rdparty/zlib-ng/arch/arm/Makefile.in +++ b/3rdparty/zlib-ng/arch/arm/Makefile.in @@ -25,7 +25,6 @@ all: \ crc32_acle.o crc32_acle.lo \ 
slide_hash_neon.o slide_hash_neon.lo \ slide_hash_armv6.o slide_hash_armv6.lo \ - insert_string_acle.o insert_string_acle.lo adler32_neon.o: $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c @@ -69,12 +68,6 @@ slide_hash_armv6.o: slide_hash_armv6.lo: $(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c -insert_string_acle.o: - $(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c - -insert_string_acle.lo: - $(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c - mostlyclean: clean clean: rm -f *.o *.lo *~ diff --git a/3rdparty/zlib-ng/arch/arm/adler32_neon.c b/3rdparty/zlib-ng/arch/arm/adler32_neon.c index f1c43ff047..8e46b38017 100644 --- a/3rdparty/zlib-ng/arch/arm/adler32_neon.c +++ b/3rdparty/zlib-ng/arch/arm/adler32_neon.c @@ -7,8 +7,8 @@ */ #ifdef ARM_NEON #include "neon_intrins.h" -#include "../../zbuild.h" -#include "../../adler32_p.h" +#include "zbuild.h" +#include "adler32_p.h" static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) { static const uint16_t ALIGNED_(16) taps[64] = { diff --git a/3rdparty/zlib-ng/arch/arm/arm_features.c b/3rdparty/zlib-ng/arch/arm/arm_features.c index a0e070ba95..d0d49764f4 100644 --- a/3rdparty/zlib-ng/arch/arm/arm_features.c +++ b/3rdparty/zlib-ng/arch/arm/arm_features.c @@ -1,4 +1,4 @@ -#include "../../zbuild.h" +#include "zbuild.h" #include "arm_features.h" #if defined(__linux__) && defined(HAVE_SYS_AUXV_H) @@ -11,6 +11,11 @@ # ifndef ID_AA64ISAR0_CRC32_VAL # define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32 # endif +#elif defined(__OpenBSD__) && defined(__aarch64__) +# include +# include +# include +# include #elif defined(__APPLE__) # if !defined(_DARWIN_C_SOURCE) # define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */ @@ -30,6 +35,16 @@ static int arm_has_crc32() { #elif defined(__FreeBSD__) && defined(__aarch64__) return getenv("QEMU_EMULATING") == NULL && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE; +#elif defined(__OpenBSD__) && defined(__aarch64__) + int hascrc32 = 0; + int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0 = 0; + size_t len = sizeof(isar0); + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE) + hascrc32 = 1; + } + return hascrc32; #elif defined(__APPLE__) int hascrc32; size_t size = sizeof(hascrc32); diff --git a/3rdparty/zlib-ng/arch/arm/arm_features.h b/3rdparty/zlib-ng/arch/arm/arm_features.h index eca078e310..d968e02fbb 100644 --- a/3rdparty/zlib-ng/arch/arm/arm_features.h +++ b/3rdparty/zlib-ng/arch/arm/arm_features.h @@ -2,8 +2,8 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef ARM_H_ -#define ARM_H_ +#ifndef ARM_FEATURES_H_ +#define ARM_FEATURES_H_ struct arm_cpu_features { int has_simd; @@ -13,4 +13,4 @@ struct arm_cpu_features { void Z_INTERNAL arm_check_features(struct arm_cpu_features *features); -#endif /* ARM_H_ */ +#endif /* ARM_FEATURES_H_ */ diff --git a/3rdparty/zlib-ng/arch/arm/arm_functions.h b/3rdparty/zlib-ng/arch/arm/arm_functions.h new file mode 100644 index 0000000000..61c682710a --- /dev/null +++ b/3rdparty/zlib-ng/arch/arm/arm_functions.h @@ -0,0 +1,65 @@ +/* arm_functions.h -- ARM implementations for arch-specific functions. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARM_FUNCTIONS_H_ +#define ARM_FUNCTIONS_H_ + +#ifdef ARM_NEON +uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t chunksize_neon(void); +uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left); + +# ifdef HAVE_BUILTIN_CTZLL +uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); +uint32_t longest_match_neon(deflate_state *const s, Pos cur_match); +uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match); +# endif +void slide_hash_neon(deflate_state *s); +void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef ARM_ACLE +uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len); +#endif + +#ifdef ARM_SIMD +void slide_hash_armv6(deflate_state *s); +#endif + + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// ARM - SIMD +# if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD) +# undef native_slide_hash +# define native_slide_hash slide_hash_armv6 +# endif +// ARM - NEON +# if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON +# undef native_adler32 +# define native_adler32 adler32_neon +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_neon +# undef native_chunksize +# define native_chunksize chunksize_neon +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_neon +# undef native_slide_hash +# define native_slide_hash slide_hash_neon +# ifdef HAVE_BUILTIN_CTZLL +# undef native_compare256 +# define native_compare256 compare256_neon +# undef native_longest_match +# define native_longest_match longest_match_neon +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_neon +# endif +# endif +// ARM - ACLE +# if defined(ARM_ACLE) && defined(__ARM_ACLE) && defined(__ARM_FEATURE_CRC32) +# undef native_crc32 +# define native_crc32 crc32_acle +# endif +#endif + +#endif /* ARM_FUNCTIONS_H_ */ diff --git a/3rdparty/zlib-ng/arch/arm/chunkset_neon.c b/3rdparty/zlib-ng/arch/arm/chunkset_neon.c index f9a444b068..1c49ef5612 100644 --- a/3rdparty/zlib-ng/arch/arm/chunkset_neon.c +++ b/3rdparty/zlib-ng/arch/arm/chunkset_neon.c @@ -4,8 +4,8 @@ #ifdef ARM_NEON #include "neon_intrins.h" -#include "../../zbuild.h" -#include "../generic/chunk_permute_table.h" +#include "zbuild.h" +#include "arch/generic/chunk_permute_table.h" typedef uint8x16_t chunk_t; diff --git a/3rdparty/zlib-ng/arch/arm/compare256_neon.c b/3rdparty/zlib-ng/arch/arm/compare256_neon.c index 7daeba411e..87d14c89c0 100644 --- a/3rdparty/zlib-ng/arch/arm/compare256_neon.c +++ b/3rdparty/zlib-ng/arch/arm/compare256_neon.c @@ -3,8 +3,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" - +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) diff --git a/3rdparty/zlib-ng/arch/arm/crc32_acle.c b/3rdparty/zlib-ng/arch/arm/crc32_acle.c index ac7d6ff66b..116bcab1c2 100644 --- a/3rdparty/zlib-ng/arch/arm/crc32_acle.c +++ b/3rdparty/zlib-ng/arch/arm/crc32_acle.c @@ -7,7 +7,7 @@ #ifdef ARM_ACLE #include "acle_intrins.h" -#include "../../zbuild.h" +#include "zbuild.h" Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) { Z_REGISTER uint32_t c; diff --git a/3rdparty/zlib-ng/arch/arm/insert_string_acle.c 
b/3rdparty/zlib-ng/arch/arm/insert_string_acle.c deleted file mode 100644 index aa8385c712..0000000000 --- a/3rdparty/zlib-ng/arch/arm/insert_string_acle.c +++ /dev/null @@ -1,24 +0,0 @@ -/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions - * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - * - */ - -#ifdef ARM_ACLE -#include "acle_intrins.h" -#include "../../zbuild.h" -#include "../../deflate.h" - -#define HASH_CALC(s, h, val) \ - h = __crc32w(0, val) - -#define HASH_CALC_VAR h -#define HASH_CALC_VAR_INIT uint32_t h = 0 - -#define UPDATE_HASH Z_TARGET_CRC update_hash_acle -#define INSERT_STRING Z_TARGET_CRC insert_string_acle -#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle - -#include "../../insert_string_tpl.h" -#endif diff --git a/3rdparty/zlib-ng/arch/arm/neon_intrins.h b/3rdparty/zlib-ng/arch/arm/neon_intrins.h index 51df77dbe6..a9e99ec88a 100644 --- a/3rdparty/zlib-ng/arch/arm/neon_intrins.h +++ b/3rdparty/zlib-ng/arch/arm/neon_intrins.h @@ -25,6 +25,13 @@ out.val[3] = vqsubq_u16(a.val[3], b); \ } while (0) +# if defined(__clang__) && defined(__arm__) && defined(__ANDROID__) +/* Clang for 32-bit Android has too strict alignment requirement (:256) for x4 NEON intrinsics */ +# undef ARM_NEON_HASLD4 +# undef vld1q_u16_x4 +# undef vld1q_u8_x4 +# undef vst1q_u16_x4 +# endif # ifndef ARM_NEON_HASLD4 diff --git a/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c b/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c index 0a2eeccf92..07f71b59eb 100644 --- a/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c +++ b/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c @@ -5,8 +5,8 @@ #if defined(ARM_SIMD) #include "acle_intrins.h" -#include "../../zbuild.h" -#include "../../deflate.h" +#include "zbuild.h" +#include "deflate.h" /* SIMD version of hash_chain rebase */ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { diff --git a/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c b/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c index a96ca11799..a601e6099a 100644 --- a/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c +++ b/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c @@ -10,8 +10,8 @@ #ifdef ARM_NEON #include "neon_intrins.h" -#include "../../zbuild.h" -#include "../../deflate.h" +#include "zbuild.h" +#include "deflate.h" /* SIMD version of hash_chain rebase */ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { diff --git a/3rdparty/zlib-ng/arch/generic/Makefile.in b/3rdparty/zlib-ng/arch/generic/Makefile.in index c717026f86..32c8242d02 100644 --- a/3rdparty/zlib-ng/arch/generic/Makefile.in +++ b/3rdparty/zlib-ng/arch/generic/Makefile.in @@ -1,5 +1,6 @@ -# Makefile for zlib +# Makefile for zlib-ng # Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# Copyright (C) 2024 Hans Kristian Rosbach # For conditions of distribution and use, see copyright notice in zlib.h CC= @@ -11,12 +12,62 @@ SRCDIR=. SRCTOP=../.. 
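This generic-arch Makefile now builds the portable scalar fallbacks (adler32_c, crc32_braid_c, compare256_c, slide_hash_c, ...) as their own objects; the first of them, adler32_c.c, reappears verbatim further down. For reference, a minimal self-contained sketch of that scalar Adler-32 with its deferred-modulo trick — the function name and test program here are illustrative, not zlib-ng code:

/* NMAX = 5552 is the largest n such that 255*n*(n+1)/2 + (n+1)*(65521-1)
 * still fits in 32 bits, so both running sums can absorb n bytes between
 * "% 65521" reductions without overflowing. */
#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define ADLER_BASE 65521u   /* largest prime below 2^16 */
#define ADLER_NMAX 5552u

static uint32_t adler32_sketch(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;          /* running sum of bytes (seeded with 1) */
    uint32_t s2 = (adler >> 16) & 0xffff;  /* running sum of the s1 values */
    while (len > 0) {
        size_t n = len < ADLER_NMAX ? len : ADLER_NMAX;
        len -= n;
        while (n--) {
            s1 += *buf++;
            s2 += s1;
        }
        s1 %= ADLER_BASE;                  /* one reduction per NMAX-byte block */
        s2 %= ADLER_BASE;
    }
    return (s2 << 16) | s1;
}

int main(void) {
    /* well-known test vector: Adler-32("Wikipedia") == 0x11E60398 */
    const uint8_t msg[] = "Wikipedia";
    assert(adler32_sketch(1, msg, sizeof(msg) - 1) == 0x11E60398);
    return 0;
}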
TOPDIR=$(SRCTOP) -all: +all: \ + adler32_c.o adler32_c.lo \ + adler32_fold_c.o adler32_fold_c.lo \ + chunkset_c.o chunkset_c.lo \ + compare256_c.o compare256_c.lo \ + crc32_braid_c.o crc32_braid_c.lo \ + crc32_fold_c.o crc32_fold_c.lo \ + slide_hash_c.o slide_hash_c.lo + + +adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c + +adler32_c.lo: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c + +adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c + +adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c + +chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c + +chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c + +compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c + +compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c + +crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c + +crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c + +crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c + +crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c + +slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c + +slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c mostlyclean: clean clean: - rm -f *.o *.lo *~ \ + rm -f *.o *.lo *~ rm -rf objs rm -f *.gcda *.gcno *.gcov diff --git a/3rdparty/zlib-ng/arch/generic/adler32_c.c b/3rdparty/zlib-ng/arch/generic/adler32_c.c new file mode 100644 index 0000000000..64258c89b4 --- /dev/null +++ b/3rdparty/zlib-ng/arch/generic/adler32_c.c @@ -0,0 +1,54 @@ +/* adler32.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011, 2016 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "functable.h" +#include "adler32_p.h" + +/* ========================================================================= */ +Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { + uint32_t sum2; + unsigned n; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return 
adler32_len_1(adler, buf, sum2); + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (UNLIKELY(buf == NULL)) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_len_16(adler, buf, len, sum2); + + /* do length NMAX blocks -- requires just one modulo operation */ + while (len >= NMAX) { + len -= NMAX; +#ifdef UNROLL_MORE + n = NMAX / 16; /* NMAX is divisible by 16 */ +#else + n = NMAX / 8; /* NMAX is divisible by 8 */ +#endif + do { +#ifdef UNROLL_MORE + DO16(adler, sum2, buf); /* 16 sums unrolled */ + buf += 16; +#else + DO8(adler, sum2, buf, 0); /* 8 sums unrolled */ + buf += 8; +#endif + } while (--n); + adler %= BASE; + sum2 %= BASE; + } + + /* do remaining bytes (less than NMAX, still just one modulo) */ + return adler32_len_64(adler, buf, len, sum2); +} diff --git a/3rdparty/zlib-ng/adler32_fold.c b/3rdparty/zlib-ng/arch/generic/adler32_fold_c.c similarity index 83% rename from 3rdparty/zlib-ng/adler32_fold.c rename to 3rdparty/zlib-ng/arch/generic/adler32_fold_c.c index e2f6f9ac7d..397dd10400 100644 --- a/3rdparty/zlib-ng/adler32_fold.c +++ b/3rdparty/zlib-ng/arch/generic/adler32_fold_c.c @@ -5,12 +5,11 @@ #include "zbuild.h" #include "functable.h" -#include "adler32_fold.h" #include Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - adler = functable.adler32(adler, src, len); + adler = FUNCTABLE_CALL(adler32)(adler, src, len); memcpy(dst, src, len); return adler; } diff --git a/3rdparty/zlib-ng/chunkset.c b/3rdparty/zlib-ng/arch/generic/chunkset_c.c similarity index 100% rename from 3rdparty/zlib-ng/chunkset.c rename to 3rdparty/zlib-ng/arch/generic/chunkset_c.c diff --git a/3rdparty/zlib-ng/compare256.c b/3rdparty/zlib-ng/arch/generic/compare256_c.c similarity index 99% rename from 3rdparty/zlib-ng/compare256.c rename to 3rdparty/zlib-ng/arch/generic/compare256_c.c index 82551cdd57..0c12cb3a4e 100644 --- a/3rdparty/zlib-ng/compare256.c +++ b/3rdparty/zlib-ng/arch/generic/compare256_c.c @@ -5,6 +5,7 @@ #include "zbuild.h" #include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" /* ALIGNED, byte comparison */ diff --git a/3rdparty/zlib-ng/crc32_braid.c b/3rdparty/zlib-ng/arch/generic/crc32_braid_c.c similarity index 79% rename from 3rdparty/zlib-ng/crc32_braid.c rename to 3rdparty/zlib-ng/arch/generic/crc32_braid_c.c index 96754b53df..7d8028f6d7 100644 --- a/3rdparty/zlib-ng/crc32_braid.c +++ b/3rdparty/zlib-ng/arch/generic/crc32_braid_c.c @@ -8,43 +8,9 @@ */ #include "zbuild.h" -#include "zutil.h" -#include "functable.h" #include "crc32_braid_p.h" #include "crc32_braid_tbl.h" -/* ========================================================================= */ - -const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) { - return (const uint32_t *)crc_table; -} - -#ifdef ZLIB_COMPAT -unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) { - if (buf == NULL) return 0; - - return (unsigned long)functable.crc32((uint32_t)crc, buf, len); -} -#else -uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) { - if (buf == NULL) return 0; - - return functable.crc32(crc, buf, len); -} -#endif - -#ifdef ZLIB_COMPAT -unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) { - return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len); -} -#else -uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, 
uint32_t len) { - return PREFIX(crc32_z)(crc, buf, len); -} -#endif - -/* ========================================================================= */ - /* A CRC of a message is computed on N braids of words in the message, where each word consists of W bytes (4 or 8). If N is 3, for example, then three @@ -66,24 +32,6 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t level. Your mileage may vary. */ -/* ========================================================================= */ - -#if BYTE_ORDER == LITTLE_ENDIAN -# define ZSWAPWORD(word) (word) -# define BRAID_TABLE crc_braid_table -#elif BYTE_ORDER == BIG_ENDIAN -# if W == 8 -# define ZSWAPWORD(word) ZSWAP64(word) -# elif W == 4 -# define ZSWAPWORD(word) ZSWAP32(word) -# endif -# define BRAID_TABLE crc_braid_big_table -#else -# error "No endian defined" -#endif -#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8) -#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 - /* ========================================================================= */ #ifdef W /* @@ -112,7 +60,7 @@ static z_word_t crc_word(z_word_t data) { /* ========================================================================= */ Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) { - Z_REGISTER uint32_t c; + uint32_t c; /* Pre-condition the CRC */ c = (~crc) & 0xffffffff; diff --git a/3rdparty/zlib-ng/crc32_fold.c b/3rdparty/zlib-ng/arch/generic/crc32_fold_c.c similarity index 86% rename from 3rdparty/zlib-ng/crc32_fold.c rename to 3rdparty/zlib-ng/arch/generic/crc32_fold_c.c index 5b3c7c459f..43930e97c6 100644 --- a/3rdparty/zlib-ng/crc32_fold.c +++ b/3rdparty/zlib-ng/arch/generic/crc32_fold_c.c @@ -3,11 +3,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ #include "zbuild.h" +#include "zutil.h" #include "functable.h" - -#include "crc32_fold.h" - -#include +#include "crc32.h" Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) { crc->value = CRC32_INITIAL_VALUE; @@ -15,7 +13,7 @@ Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) { } Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { - crc->value = functable.crc32(crc->value, src, len); + crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len); memcpy(dst, src, len); } @@ -25,7 +23,7 @@ Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, ui * same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The * init_crc is an unused argument in this context */ Z_UNUSED(init_crc); - crc->value = functable.crc32(crc->value, src, len); + crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len); } Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) { diff --git a/3rdparty/zlib-ng/arch/generic/generic_functions.h b/3rdparty/zlib-ng/arch/generic/generic_functions.h new file mode 100644 index 0000000000..997dd4d01e --- /dev/null +++ b/3rdparty/zlib-ng/arch/generic/generic_functions.h @@ -0,0 +1,106 @@ +/* generic_functions.h -- generic C implementations for arch-specific functions. 
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef GENERIC_FUNCTIONS_H_
+#define GENERIC_FUNCTIONS_H_
+
+#include "zendian.h"
+
+Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
+Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
+
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+
+
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
+typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
+typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
+
+uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
+
+uint32_t chunksize_c(void);
+uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
+
+uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
+
+uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
+# ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
+# endif
+# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+    uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
+# endif
+#endif
+
+typedef void (*slide_hash_func)(deflate_state *s);
+
+void slide_hash_c(deflate_state *s);
+
+uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
+# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+    uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
+# ifdef HAVE_BUILTIN_CTZ
+    uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
+# endif
+# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+    uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
+# endif
+# endif
+
+uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
+# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+    uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
+# ifdef UNALIGNED64_OK
+    uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
+# endif
+# endif
+
+
+// Select generic implementation for longest_match, longest_match_slow and compare256 functions.
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN +# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) +# define longest_match_generic longest_match_unaligned_64 +# define longest_match_slow_generic longest_match_slow_unaligned_64 +# define compare256_generic compare256_unaligned_64 +# elif defined(HAVE_BUILTIN_CTZ) +# define longest_match_generic longest_match_unaligned_32 +# define longest_match_slow_generic longest_match_slow_unaligned_32 +# define compare256_generic compare256_unaligned_32 +# else +# define longest_match_generic longest_match_unaligned_16 +# define longest_match_slow_generic longest_match_slow_unaligned_16 +# define compare256_generic compare256_unaligned_16 +# endif +#else +# define longest_match_generic longest_match_c +# define longest_match_slow_generic longest_match_slow_c +# define compare256_generic compare256_c +#endif + + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// Generic code +# define native_adler32 adler32_c +# define native_adler32_fold_copy adler32_fold_copy_c +# define native_chunkmemset_safe chunkmemset_safe_c +# define native_chunksize chunksize_c +# define native_crc32 PREFIX(crc32_braid) +# define native_crc32_fold crc32_fold_c +# define native_crc32_fold_copy crc32_fold_copy_c +# define native_crc32_fold_final crc32_fold_final_c +# define native_crc32_fold_reset crc32_fold_reset_c +# define native_inflate_fast inflate_fast_c +# define native_slide_hash slide_hash_c +# define native_longest_match longest_match_generic +# define native_longest_match_slow longest_match_slow_generic +# define native_compare256 compare256_generic +#endif + +#endif diff --git a/3rdparty/zlib-ng/slide_hash.c b/3rdparty/zlib-ng/arch/generic/slide_hash_c.c similarity index 96% rename from 3rdparty/zlib-ng/slide_hash.c rename to 3rdparty/zlib-ng/arch/generic/slide_hash_c.c index b9fbbdb69f..8345b9e36b 100644 --- a/3rdparty/zlib-ng/slide_hash.c +++ b/3rdparty/zlib-ng/arch/generic/slide_hash_c.c @@ -1,6 +1,6 @@ /* slide_hash.c -- slide hash table C implementation * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git a/3rdparty/zlib-ng/arch/power/chunkset_power8.c b/3rdparty/zlib-ng/arch/power/chunkset_power8.c index 7cbb8029b3..aef1973273 100644 --- a/3rdparty/zlib-ng/arch/power/chunkset_power8.c +++ b/3rdparty/zlib-ng/arch/power/chunkset_power8.c @@ -4,7 +4,7 @@ #ifdef POWER8_VSX #include -#include "../../zbuild.h" +#include "zbuild.h" typedef vector unsigned char chunk_t; diff --git a/3rdparty/zlib-ng/arch/power/compare256_power9.c b/3rdparty/zlib-ng/arch/power/compare256_power9.c index 9b0ddaf800..c8be498e4f 100644 --- a/3rdparty/zlib-ng/arch/power/compare256_power9.c +++ b/3rdparty/zlib-ng/arch/power/compare256_power9.c @@ -5,8 +5,10 @@ #ifdef POWER9 #include -#include "../../zbuild.h" -#include "../../zendian.h" +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" +#include "zendian.h" /* Older versions of GCC misimplemented semantics for these bit counting builtins. 
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */ diff --git a/3rdparty/zlib-ng/arch/power/power_features.c b/3rdparty/zlib-ng/arch/power/power_features.c index f73503734b..4939d1c18f 100644 --- a/3rdparty/zlib-ng/arch/power/power_features.c +++ b/3rdparty/zlib-ng/arch/power/power_features.c @@ -1,16 +1,19 @@ /* power_features.c - POWER feature check * Copyright (C) 2020 Matheus Castanho , IBM - * Copyright (C) 2021-2022 Mika T. Lindqvist + * Copyright (C) 2021-2024 Mika T. Lindqvist * For conditions of distribution and use, see copyright notice in zlib.h */ #ifdef HAVE_SYS_AUXV_H # include #endif +#ifdef POWER_NEED_AUXVEC_H +# include +#endif #ifdef __FreeBSD__ # include #endif -#include "../../zbuild.h" +#include "zbuild.h" #include "power_features.h" void Z_INTERNAL power_check_features(struct power_cpu_features *features) { diff --git a/3rdparty/zlib-ng/arch/power/power_features.h b/3rdparty/zlib-ng/arch/power/power_features.h index 9252364cc4..1ff51de5dd 100644 --- a/3rdparty/zlib-ng/arch/power/power_features.h +++ b/3rdparty/zlib-ng/arch/power/power_features.h @@ -4,8 +4,8 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef POWER_H_ -#define POWER_H_ +#ifndef POWER_FEATURES_H_ +#define POWER_FEATURES_H_ struct power_cpu_features { int has_altivec; @@ -15,4 +15,4 @@ struct power_cpu_features { void Z_INTERNAL power_check_features(struct power_cpu_features *features); -#endif /* POWER_H_ */ +#endif /* POWER_FEATURES_H_ */ diff --git a/3rdparty/zlib-ng/arch/power/power_functions.h b/3rdparty/zlib-ng/arch/power/power_functions.h new file mode 100644 index 0000000000..cb6b7650ec --- /dev/null +++ b/3rdparty/zlib-ng/arch/power/power_functions.h @@ -0,0 +1,67 @@ +/* power_functions.h -- POWER implementations for arch-specific functions. + * Copyright (C) 2020 Matheus Castanho , IBM + * Copyright (C) 2021 Mika T. 
Lindqvist + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef POWER_FUNCTIONS_H_ +#define POWER_FUNCTIONS_H_ + +#ifdef PPC_VMX +uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len); +void slide_hash_vmx(deflate_state *s); +#endif + +#ifdef POWER8_VSX +uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t chunksize_power8(void); +uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left); +uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len); +void slide_hash_power8(deflate_state *s); +void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef POWER9 +uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1); +uint32_t longest_match_power9(deflate_state *const s, Pos cur_match); +uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match); +#endif + + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// Power - VMX +# if defined(PPC_VMX) && defined(__ALTIVEC__) +# undef native_adler32 +# define native_adler32 adler32_vmx +# undef native_slide_hash +# define native_slide_hash slide_hash_vmx +# endif +// Power8 - VSX +# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__) +# undef native_adler32 +# define native_adler32 adler32_power8 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_power8 +# undef native_chunksize +# define native_chunksize chunksize_power8 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_power8 +# undef native_slide_hash +# define native_slide_hash slide_hash_power8 +# endif +# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__) +# undef native_crc32 +# define native_crc32 crc32_power8 +# endif +// Power9 +# if defined(POWER9) && defined(_ARCH_PWR9) +# undef native_compare256 +# define native_compare256 compare256_power9 +# undef native_longest_match +# define native_longest_match longest_match_power9 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_power9 +# endif +#endif + +#endif /* POWER_FUNCTIONS_H_ */ diff --git a/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c b/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c index da46f37e73..d0f9aaa567 100644 --- a/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c +++ b/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c @@ -9,8 +9,8 @@ #include #include -#include "../../zbuild.h" -#include "../../adler32_p.h" +#include "zbuild.h" +#include "adler32_p.h" static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) { /* split Adler-32 into component sums */ diff --git a/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c b/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c index 0fd6082c44..3d6c3e3aa5 100644 --- a/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c +++ b/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c @@ -6,7 +6,9 @@ #ifdef RISCV_RVV -#include "../../zbuild.h" +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" #include diff --git a/3rdparty/zlib-ng/arch/riscv/riscv_features.c b/3rdparty/zlib-ng/arch/riscv/riscv_features.c index b066f427e0..1e3f45e0a7 100644 --- a/3rdparty/zlib-ng/arch/riscv/riscv_features.c +++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.c @@ -1,10 +1,13 @@ #include #include #include -#include #include -#include "../../zbuild.h" +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# include +#endif + +#include "zbuild.h" #include "riscv_features.h" 
#define ISA_V_HWCAP (1 << ('v' - 'a')) @@ -33,7 +36,11 @@ void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *fea } void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) { +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) unsigned long hw_cap = getauxval(AT_HWCAP); +#else + unsigned long hw_cap = 0; +#endif features->has_rvv = hw_cap & ISA_V_HWCAP; } diff --git a/3rdparty/zlib-ng/arch/riscv/riscv_features.h b/3rdparty/zlib-ng/arch/riscv/riscv_features.h index c76e967c36..b1593acc25 100644 --- a/3rdparty/zlib-ng/arch/riscv/riscv_features.h +++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.h @@ -6,8 +6,8 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef RISCV_H_ -#define RISCV_H_ +#ifndef RISCV_FEATURES_H_ +#define RISCV_FEATURES_H_ struct riscv_cpu_features { int has_rvv; @@ -15,4 +15,4 @@ struct riscv_cpu_features { void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features); -#endif /* RISCV_H_ */ +#endif /* RISCV_FEATURES_H_ */ diff --git a/3rdparty/zlib-ng/arch/riscv/riscv_functions.h b/3rdparty/zlib-ng/arch/riscv/riscv_functions.h new file mode 100644 index 0000000000..015b2fbd75 --- /dev/null +++ b/3rdparty/zlib-ng/arch/riscv/riscv_functions.h @@ -0,0 +1,49 @@ +/* riscv_functions.h -- RISCV implementations for arch-specific functions. + * + * Copyright (C) 2023 SiFive, Inc. All rights reserved. + * Contributed by Alex Chiang + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef RISCV_FUNCTIONS_H_ +#define RISCV_FUNCTIONS_H_ + +#ifdef RISCV_RVV +uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t chunksize_rvv(void); +uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left); +uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1); + +uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match); +uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match); +void slide_hash_rvv(deflate_state *s); +void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// RISCV - RVV +# if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__) +# undef native_adler32 +# define native_adler32 adler32_rvv +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_rvv +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_rvv +# undef native_chunksize +# define native_chunksize chunksize_rvv +# undef native_compare256 +# define native_compare256 compare256_rvv +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_rvv +# undef native_longest_match +# define native_longest_match longest_match_rvv +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_rvv +# undef native_slide_hash +# define native_slide_hash slide_hash_rvv +# endif +#endif + +#endif /* RISCV_FUNCTIONS_H_ */ diff --git a/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c b/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c index 1164e89ba2..ac28bbd9f2 100644 --- a/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c +++ b/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c @@ -8,18 +8,16 @@ #include -#include "../../zbuild.h" -#include "../../deflate.h" +#include "zbuild.h" +#include "deflate.h" static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { 
size_t vl; while (entries > 0) { vl = __riscv_vsetvl_e16m4(entries); vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl); - vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl); - vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl); - v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl); - __riscv_vse16_v_u16m4(table, v_tab, vl); + vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl); + __riscv_vse16_v_u16m4(table, v_diff, vl); table += vl, entries -= vl; } } diff --git a/3rdparty/zlib-ng/arch/s390/Makefile.in b/3rdparty/zlib-ng/arch/s390/Makefile.in new file mode 100644 index 0000000000..e994157df2 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/Makefile.in @@ -0,0 +1,48 @@ +# Makefile for zlib-ng +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= +VGFMAFLAG= +NOLTOFLAG= + +SRCDIR=. +SRCTOP=../.. +TOPDIR=$(SRCTOP) + +s390_features.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c + +s390_features.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c + +dfltcc_deflate.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c + +dfltcc_deflate.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c + +dfltcc_inflate.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c + +dfltcc_inflate.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c + +crc32-vx.o: + $(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c + +crc32-vx.lo: + $(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/3rdparty/zlib-ng/arch/s390/README.md b/3rdparty/zlib-ng/arch/s390/README.md new file mode 100644 index 0000000000..7b383cc998 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/README.md @@ -0,0 +1,277 @@ +# Introduction + +This directory contains SystemZ deflate hardware acceleration support. +It can be enabled using the following build commands: + + $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate + $ make + +or + + $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 . + $ make + +When built like this, zlib-ng would compress using hardware on level 1, +and using software on all other levels. Decompression will always happen +in hardware. In order to enable hardware compression for levels 1-6 +(i.e. to make it used by default) one could add +`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng. + +SystemZ deflate hardware acceleration is available on [IBM z15]( +https://www.ibm.com/products/z15) and newer machines under the name [ +"Integrated Accelerator for zEnterprise Data Compression"]( +https://www.ibm.com/support/z-content-solutions/compression/). The +programming interface to it is a machine instruction called DEFLATE +CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles +of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both +the code and the rest of this document refer to this feature simply as +"DFLTCC". + +# Performance + +Performance figures are published [here]( +https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine +). The compression speed-up can be as high as 110x and the decompression +speed-up can be as high as 15x. 
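
To make the level-mask convention above concrete, here is a minimal illustrative sketch (not part of the patch): each bit of `DFLTCC_LEVEL_MASK` selects one compression level, matching the `(level_mask & (1 << level))` test in `dfltcc_can_deflate_with_params()` later in this patch. The default mask `0x2` covers level 1 only, which is why hardware compression is used only on level 1 unless `0x7e` (bits 1-6) is set.

```
/* Illustrative sketch only: how a DFLTCC_LEVEL_MASK value maps to
 * compression levels. Mirrors the (level_mask & (1 << level)) check
 * in dfltcc_can_deflate_with_params(). */
#include <stdio.h>

int main(void) {
    unsigned level_mask = 0x7e; /* bits 1..6 set: hardware for levels 1-6 */
    for (int level = 0; level <= 9; level++)
        printf("level %d -> %s\n", level,
               (level_mask & (1u << level)) ? "DFLTCC" : "software");
    return 0;
}
```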
+
+# Limitations
+
+Two DFLTCC compression calls with identical inputs are not guaranteed to
+produce identical outputs. Therefore, care should be taken when using
+hardware compression if reproducible results are desired. In particular,
+the zlib-ng-specific `zng_deflateSetParams` call allows setting the
+`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
+particular stream.
+
+DFLTCC does not support every zlib-ng feature, in particular:
+
+* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
+* `inflateMark()`
+* `inflatePrime()`
+* `inflateSyncPoint()`
+
+When used, these functions will either switch to software or, in case
+this is not possible, fail gracefully.
+
+# Code structure
+
+All SystemZ-specific code lives in the `arch/s390` directory and is
+integrated with the rest of zlib-ng using hook macros.
+
+## Hook macros
+
+DFLTCC takes as arguments a parameter block, an input buffer, an output
+buffer, and a window. Parameter blocks are stored alongside zlib states;
+buffers are forwarded from the caller; and the window, which must be
+4k-aligned and is always 64k in size, is managed using the
+`PAD_WINDOW()`, `WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW`,
+`DEFLATE_ADJUST_WINDOW_SIZE()` and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.
+
+Software and hardware window formats do not match; therefore,
+`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
+and `inflateGetDictionary()` need special handling, which is triggered using
+the `DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
+`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
+
+`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
+parameter block using the `DEFLATE_RESET_KEEP_HOOK()` and
+`INFLATE_RESET_KEEP_HOOK()` macros.
+
+The `INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
+`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
+calls fail gracefully.
+
+`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
+software compression mid-stream using `deflateParams()`. Switching
+normally entails flushing the current block, which might not be possible
+in low-memory situations. `deflateParams()` uses the `DEFLATE_DONE()`
+hook in order to detect and gracefully handle such situations.
+
+The algorithm implemented in hardware has a different compression ratio
+than the one implemented in software. The `DEFLATE_BOUND_ADJUST_COMPLEN()`
+and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
+return the correct results for the hardware implementation.
+
+Actual compression and decompression are handled by the `DEFLATE_HOOK()`
+and `INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages
+the window on its own, calling `updatewindow()` is suppressed using the
+`INFLATE_NEED_UPDATEWINDOW()` macro.
+
+In addition to compression, DFLTCC computes CRC-32 and Adler-32
+checksums; therefore, whenever it is used, software checksumming is
+suppressed using the `DEFLATE_NEED_CHECKSUM()` and
+`INFLATE_NEED_CHECKSUM()` macros.
+
+While software always produces reproducible compression results, this
+is not the case for DFLTCC. Therefore, zlib-ng users are given the
+ability to specify whether or not reproducible compression results
+are required. While it is always possible to specify this setting
+before the compression begins, it is not always possible to do so in
+the middle of a deflate stream; the exact conditions for that are
+determined by the `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
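
As a usage sketch for the reproducibility setting above (not part of the patch; the `zng_deflate_param_value` layout and the `Z_DEFLATE_REPRODUCIBLE` constant are assumed from zlib-ng's public header and should be verified there), a caller could disable DFLTCC for a single stream like this:

```
/* Sketch, assuming the zlib-ng public API: request reproducible output
 * for one stream, which in turn disables DFLTCC for that stream. */
#include "zlib-ng.h"

static int make_reproducible(zng_stream *strm) {
    int on = 1;
    zng_deflate_param_value p;
    p.param = Z_DEFLATE_REPRODUCIBLE; /* assumed parameter id */
    p.buf = &on;                      /* parameter value buffer */
    p.size = sizeof(on);
    return zng_deflateSetParams(strm, &p, 1);
}
```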
+
+## SystemZ-specific code
+
+When zlib-ng is built with DFLTCC, the hooks described above are
+converted to calls to functions, which are implemented in the
+`arch/s390/dfltcc_*` files. The functions can be grouped into three
+broad categories:
+
+* Base DFLTCC support, e.g. `dfltcc()`, which wraps the machine instruction.
+* Translating between software and hardware data formats, e.g.
+  `dfltcc_deflate_set_dictionary()`.
+* Translating between software and hardware state machines, e.g.
+  `dfltcc_deflate()` and `dfltcc_inflate()`.
+
+The functions from the first two categories are fairly simple; however,
+various quirks in both the software and hardware state machines make the
+functions from the third category quite complicated.
+
+### `dfltcc_deflate()` function
+
+This function is called by `deflate()` and has the following
+responsibilities:
+
+* Checking whether DFLTCC can be used with the current stream. If this
+  is not the case, it returns `0`, making `deflate()` use some other
+  function in order to compress in software. Otherwise it returns `1`.
+* Block management and Huffman table generation. DFLTCC ends blocks only
+  when explicitly instructed to do so by the software. Furthermore,
+  whether to use fixed or dynamic Huffman tables must also be determined
+  by the software. Since looking at the data in order to gather
+  statistics would negate the performance benefits, the following
+  approach is used: the first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are
+  placed into a fixed block, and each subsequent `DFLTCC_BLOCK_SIZE`
+  bytes are placed into dynamic blocks.
+* Writing EOBS. The Block Closing Control bit in the parameter block
+  instructs DFLTCC to write the EOBS; however, certain conditions need
+  to be met: the input data length must be non-zero or the Continuation
+  Flag must be set. Put simply, DFLTCC will silently refuse to write the
+  EOBS if that is the only thing it is asked to do. Since the code has
+  to be able to emit the EOBS in software anyway, Block Closing Control
+  is never used, which avoids tricky corner cases. Whether to write the
+  EOBS is instead controlled by the `soft_bcc` variable.
+* Triggering block post-processing. Depending on the flush mode,
+  `deflate()` must perform various additional actions when a block or a
+  stream ends. `dfltcc_deflate()` informs `deflate()` about this using
+  the `block_state *result` parameter.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `wrap` and Check Value Type, or
+  `bi_valid` and Sub-Byte Boundary. Certain fields cannot be translated
+  and must persist untouched in the parameter block between calls, for
+  example, the Continuation Flag or the Continuation-State Buffer.
+* Handling flush modes and low-memory situations. These aspects are
+  quite intertwined and pervasive. The general idea here is that when
+  the Continuation Flag is set, the code must not do anything in
+  software, whether explicitly (e.g. by calling `send_eobs()`) or
+  implicitly (by returning to `deflate()` with certain return and
+  `*result` values).
+* Ending streams. When a new block is started and the flush mode is
+  `Z_FINISH`, the Block Header Final parameter block bit is used to mark
+  this block as final. However, sometimes an empty final block is
+  needed, and, unfortunately, just like with the EOBS, DFLTCC will
+  silently refuse to write it. The general idea of the DFLTCC
+  implementation is to rely as much as possible on the existing code, so
+  here the code pretends that it does not support DFLTCC, which makes
+  `deflate()` call a software compression function, which writes an
+  empty final block. Whether this is required is controlled by the
+  `need_empty_block` variable.
+* Error handling. This is simply a matter of converting the
+  Operation-Ending-Supplemental Code to a string. Errors can only happen
+  due to things like memory corruption, and therefore they don't affect
+  the `deflate()` return code.
+
+### `dfltcc_inflate()` function
+
+This function is called by `inflate()` from the `TYPEDO` state (that is,
+when all the metadata is parsed and the stream is positioned at the type
+bits of the deflate block header) and is responsible for the following:
+
+* Falling back to software when the flush mode is `Z_BLOCK` or
+  `Z_TREES`. Unfortunately, there is no way to ask DFLTCC to stop
+  decompressing on a block or tree boundary.
+* `inflate()` decompression loop management. This is controlled using
+  the return value, which can be either `DFLTCC_INFLATE_BREAK` or
+  `DFLTCC_INFLATE_CONTINUE`.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `whave` and History Length, or `wnext`
+  and History Offset.
+* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
+  and is controlled by the `last` state field.
+* Error handling. As with deflate, error handling comprises converting
+  the Operation-Ending-Supplemental Code to a string. Unlike deflate,
+  errors may happen due to bad input; therefore, they are propagated to
+  `inflate()` by setting the `mode` field to `MEM` or `BAD`.
+
+# Testing
+
+Given the complexity of the DFLTCC machine instruction, it is not clear
+whether QEMU TCG will ever support it. At the time of writing, one has
+to have access to an IBM z15+ VM or LPAR in order to test DFLTCC
+support. Since DFLTCC is a non-privileged instruction, neither a special
+VM/LPAR configuration nor root access is required.
+
+zlib-ng CI uses an IBM-provided z15 self-hosted builder for DFLTCC
+testing. There is no official IBM Z GitHub Actions runner, so we build
+one inspired by `anup-kodlekere/gaplib`. Future updates to
+actions-runner might need an updated patch. The .NET version-number
+patch has been split into a separate file to avoid having to change the
+patch constantly.
+
+## Configuring the builder.
+
+### Install prerequisites.
+
+```
+sudo dnf install podman
+```
+
+### Add actions-runner service.
+
+```
+sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
+sudo systemctl daemon-reload
+```
+
+### Create a config file; needs a GitHub personal access token.
+
+```
+# Create file /etc/actions-runner
+repo=/
+access_token=
+```
+
+The access token should have the repo scope; consult
+https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
+for details.
+
+### Autostart actions-runner.
+
+```
+$ sudo systemctl enable --now actions-runner
+```
+
+## Rebuilding the container
+
+In order to update the `gaplib-actions-runner` podman container, e.g. to
+get the latest OS security fixes, follow these steps:
+```
+# Stop actions-runner service
+sudo systemctl stop actions-runner
+
+# Delete old container
+sudo podman container rm gaplib-actions-runner
+
+# Delete old image
+sudo podman image rm localhost/zlib-ng/actions-runner
+
+# Build image
+sudo podman build --squash -f Dockerfile.zlib-ng --tag zlib-ng/actions-runner --build-arg .
+ +# Build container +sudo podman create --name=gaplib-actions-runner --env-file=/etc/actions-runner --init --interactive --volume=actions-runner-temp:/home/actions-runner zlib-ng/actions-runner + +# Start actions-runner service +sudo systemctl start actions-runner +``` diff --git a/3rdparty/zlib-ng/arch/s390/crc32-vx.c b/3rdparty/zlib-ng/arch/s390/crc32-vx.c new file mode 100644 index 0000000000..b3dcbf7030 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/crc32-vx.c @@ -0,0 +1,222 @@ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32 checksums. + * + * This CRC-32 implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). + * + * This code was originally written by Hendrik Brueckner + * for use in the Linux kernel and has been + * relicensed under the zlib license. + */ + +#include "zbuild.h" +#include "arch_functions.h" + +#include + +typedef unsigned char uv16qi __attribute__((vector_size(16))); +typedef unsigned int uv4si __attribute__((vector_size(16))); +typedef unsigned long long uv2di __attribute__((vector_size(16))); + +static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) { + /* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 + * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 + * R3 = [(x128+32 mod P'(x) << 32)]' << 1 + * R4 = [(x128-32 mod P'(x) << 32)]' << 1 + * R5 = [(x64 mod P'(x) << 32)]' << 1 + * R6 = [(x32 mod P'(x) << 32)]' << 1 + * + * The bitreflected Barret reduction constant, u', is defined as + * the bit reversal of floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + */ + const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */ + const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */ + const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */ + const uv2di r5 = {0, 0x163CD6124}; /* R5 */ + const uv2di ru_poly = {0, 0x1F7011641}; /* u' */ + const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */ + + /* + * Load the initial CRC value. + * + * The CRC value is loaded into the rightmost word of the + * vector register and is later XORed with the LSB portion + * of the loaded input data. 
+ */ + uv2di v0 = {0, 0}; + v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3); + + /* Load a 64-byte data chunk and XOR with CRC */ + uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be); + uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be); + uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be); + uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be); + + v1 ^= v0; + buf += 64; + len -= 64; + + while (len >= 64) { + /* Load the next 64-byte data chunk */ + uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be); + uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be); + uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be); + uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be); + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate result + * is then folded (accumulated) with the next data chunk in PART1 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. + */ + v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1); + v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2); + v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3); + v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4); + + buf += 64; + len -= 64; + } + + /* + * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 + * and R4 and accumulating the next 128-bit chunk until a single 128-bit + * value remains. + */ + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2); + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3); + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4); + + while (len >= 16) { + /* Load next data chunk */ + v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be); + + /* Fold next data chunk */ + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2); + + buf += 16; + len -= 16; + } + + /* + * Set up a vector register for byte shifts. The shift value must + * be loaded in bits 1-4 in byte element 7 of a vector register. + * Shift by 8 bytes: 0x40 + * Shift by 4 bytes: 0x20 + */ + uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + v9 = vec_insert((unsigned char)0x40, v9, 7); + + /* + * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes + * to move R4 into the rightmost doubleword and set the leftmost + * doubleword to 0x1. + */ + v0 = vec_srb(r4r3, (uv2di)v9); + v0[0] = 1; + + /* + * Compute GF(2) product of V1 and V0. The rightmost doubleword + * of V1 is multiplied with R4. The leftmost doubleword of V1 is + * multiplied by 0x1 and is then XORed with rightmost product. + * Implicitly, the intermediate leftmost product becomes padded + */ + v1 = (uv2di)vec_gfmsum_128(v0, v1); + + /* + * Now do the final 32-bit fold by multiplying the rightmost word + * in V1 with R5 and XOR the result with the remaining bits in V1. + * + * To achieve this by a single VGFMAG, right shift V1 by a word + * and store the result in V2 which is then accumulated. Use the + * vector unpack instruction to load the rightmost half of the + * doubleword into the rightmost doubleword element of V1; the other + * half is loaded in the leftmost doubleword. + * The vector register with CONST_R5 contains the R5 constant in the + * rightmost doubleword and the leftmost doubleword is zero to ignore + * the leftmost product of V1. 
+ */ + v9 = vec_insert((unsigned char)0x20, v9, 7); + v2 = vec_srb(v1, (uv2di)v9); + v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */ + v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2); + + /* + * Apply a Barret reduction to compute the final 32-bit CRC value. + * + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: The leftmost doubleword of vector register containing + * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product + * is zero and does not contribute to the final result. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + v2 = vec_unpackl((uv4si)v1); + v2 = (uv2di)vec_gfmsum_128(ru_poly, v2); + + /* + * Compute the GF(2) product of the CRC polynomial with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is stored in word element 2 of V2. + */ + v2 = vec_unpackl((uv4si)v2); + v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1); + + return ((uv4si)v2)[2]; +} + +#define VX_MIN_LEN 64 +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) { + size_t prealign, aligned, remaining; + + if (len < VX_MIN_LEN + VX_ALIGN_MASK) + return PREFIX(crc32_braid)(crc, buf, len); + + if ((uintptr_t)buf & VX_ALIGN_MASK) { + prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK); + len -= prealign; + crc = PREFIX(crc32_braid)(crc, buf, prealign); + buf += prealign; + } + aligned = len & ~VX_ALIGN_MASK; + remaining = len & VX_ALIGN_MASK; + + crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff; + + if (remaining) + crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining); + + return crc; +} diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_common.h b/3rdparty/zlib-ng/arch/s390/dfltcc_common.h new file mode 100644 index 0000000000..a6527ab5df --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_common.h @@ -0,0 +1,119 @@ +#ifndef DFLTCC_COMMON_H +#define DFLTCC_COMMON_H + +#include "zutil.h" + +/* + Parameter Block for Query Available Functions. + */ +struct dfltcc_qaf_param { + char fns[16]; + char reserved1[8]; + char fmts[2]; + char reserved2[6]; +} ALIGNED_(8); + +/* + Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand. 
+ */ +struct dfltcc_param_v0 { + uint16_t pbvn; /* Parameter-Block-Version Number */ + uint8_t mvn; /* Model-Version Number */ + uint8_t ribm; /* Reserved for IBM use */ + uint32_t reserved32 : 31; + uint32_t cf : 1; /* Continuation Flag */ + uint8_t reserved64[8]; + uint32_t nt : 1; /* New Task */ + uint32_t reserved129 : 1; + uint32_t cvt : 1; /* Check Value Type */ + uint32_t reserved131 : 1; + uint32_t htt : 1; /* Huffman-Table Type */ + uint32_t bcf : 1; /* Block-Continuation Flag */ + uint32_t bcc : 1; /* Block Closing Control */ + uint32_t bhf : 1; /* Block Header Final */ + uint32_t reserved136 : 1; + uint32_t reserved137 : 1; + uint32_t dhtgc : 1; /* DHT Generation Control */ + uint32_t reserved139 : 5; + uint32_t reserved144 : 5; + uint32_t sbb : 3; /* Sub-Byte Boundary */ + uint8_t oesc; /* Operation-Ending-Supplemental Code */ + uint32_t reserved160 : 12; + uint32_t ifs : 4; /* Incomplete-Function Status */ + uint16_t ifl; /* Incomplete-Function Length */ + uint8_t reserved192[8]; + uint8_t reserved256[8]; + uint8_t reserved320[4]; + uint16_t hl; /* History Length */ + uint32_t reserved368 : 1; + uint16_t ho : 15; /* History Offset */ + uint32_t cv; /* Check Value */ + uint32_t eobs : 15; /* End-of-block Symbol */ + uint32_t reserved431: 1; + uint8_t eobl : 4; /* End-of-block Length */ + uint32_t reserved436 : 12; + uint32_t reserved448 : 4; + uint16_t cdhtl : 12; /* Compressed-Dynamic-Huffman Table + Length */ + uint8_t reserved464[6]; + uint8_t cdht[288]; /* Compressed-Dynamic-Huffman Table */ + uint8_t reserved[24]; + uint8_t ribm2[8]; /* Reserved for IBM use */ + uint8_t csb[1152]; /* Continuation-State Buffer */ +} ALIGNED_(8); + +/* + Extension of inflate_state and deflate_state. + */ +struct dfltcc_state { + struct dfltcc_param_v0 param; /* Parameter block. */ + struct dfltcc_qaf_param af; /* Available functions. */ + char msg[64]; /* Buffer for strm->msg */ +}; + +typedef struct { + struct dfltcc_state common; + uint16_t level_mask; /* Levels on which to use DFLTCC */ + uint32_t block_size; /* New block each X bytes */ + size_t block_threshold; /* New block after total_in > X */ + uint32_t dht_threshold; /* New block only if avail_in >= X */ +} arch_deflate_state; + +typedef struct { + struct dfltcc_state common; +} arch_inflate_state; + +/* + History buffer size. + */ +#define HB_BITS 15 +#define HB_SIZE (1 << HB_BITS) + +/* + Sizes of deflate block parts. + */ +#define DFLTCC_BLOCK_HEADER_BITS 3 +#define DFLTCC_HLITS_COUNT_BITS 5 +#define DFLTCC_HDISTS_COUNT_BITS 5 +#define DFLTCC_HCLENS_COUNT_BITS 4 +#define DFLTCC_MAX_HCLENS 19 +#define DFLTCC_HCLEN_BITS 3 +#define DFLTCC_MAX_HLITS 286 +#define DFLTCC_MAX_HDISTS 30 +#define DFLTCC_MAX_HLIT_HDIST_BITS 7 +#define DFLTCC_MAX_SYMBOL_BITS 16 +#define DFLTCC_MAX_EOBS_BITS 15 +#define DFLTCC_MAX_PADDING_BITS 7 + +#define DEFLATE_BOUND_COMPLEN(source_len) \ + ((DFLTCC_BLOCK_HEADER_BITS + \ + DFLTCC_HLITS_COUNT_BITS + \ + DFLTCC_HDISTS_COUNT_BITS + \ + DFLTCC_HCLENS_COUNT_BITS + \ + DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \ + (DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \ + (source_len) * DFLTCC_MAX_SYMBOL_BITS + \ + DFLTCC_MAX_EOBS_BITS + \ + DFLTCC_MAX_PADDING_BITS) >> 3) + +#endif diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c new file mode 100644 index 0000000000..90b4b96e9c --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c @@ -0,0 +1,383 @@ +/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. 
*/ + +/* + Use the following commands to build zlib-ng with DFLTCC compression support: + + $ ./configure --with-dfltcc-deflate + or + + $ cmake -DWITH_DFLTCC_DEFLATE=1 . + + and then + + $ make +*/ + +#include "zbuild.h" +#include "deflate.h" +#include "trees_emit.h" +#include "dfltcc_deflate.h" +#include "dfltcc_detail.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + arch_deflate_state *dfltcc_state = &state->arch; + + dfltcc_reset_state(&dfltcc_state->common); + + /* Initialize tuning parameters */ + dfltcc_state->level_mask = DFLTCC_LEVEL_MASK; + dfltcc_state->block_size = DFLTCC_BLOCK_SIZE; + dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE; + dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE; +} + +static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy, + int reproducible) { + deflate_state *state = (deflate_state *)strm->state; + arch_deflate_state *dfltcc_state = &state->arch; + + /* Unsupported compression settings */ + if ((dfltcc_state->level_mask & (1 << level)) == 0) + return 0; + if (window_bits != HB_BITS) + return 0; + if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY) + return 0; + if (reproducible) + return 0; + + /* Unsupported hardware */ + if (!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_GDHT) || + !is_bit_set(dfltcc_state->common.af.fns, DFLTCC_CMPR) || + !is_bit_set(dfltcc_state->common.af.fmts, DFLTCC_FMT0)) + return 0; + + return 1; +} + +int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + + return dfltcc_can_deflate_with_params(strm, state->level, state->w_bits, state->strategy, state->reproducible); +} + +static inline void dfltcc_gdht(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + size_t avail_in = strm->avail_in; + + dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL); +} + +static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + size_t avail_in = strm->avail_in; + size_t avail_out = strm->avail_out; + dfltcc_cc cc; + + cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR, + param, &strm->next_out, &avail_out, + &strm->next_in, &avail_in, state->window); + strm->total_in += (strm->avail_in - avail_in); + strm->total_out += (strm->avail_out - avail_out); + strm->avail_in = avail_in; + strm->avail_out = avail_out; + return cc; +} + +static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) { + deflate_state *state = (deflate_state *)strm->state; + + send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid); + PREFIX(flush_pending)(strm); + if (state->pending != 0) { + /* The remaining data is located in pending_out[0:pending]. If someone + * calls put_byte() - this might happen in deflate() - the byte will be + * placed into pending_buf[pending], which is incorrect. Move the + * remaining data to the beginning of pending_buf so that put_byte() is + * usable again. 
+ */ + memmove(state->pending_buf, state->pending_out, state->pending); + state->pending_out = state->pending_buf; + } +#ifdef ZLIB_DEBUG + state->compressed_len += param->eobl; +#endif +} + +int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) { + deflate_state *state = (deflate_state *)strm->state; + arch_deflate_state *dfltcc_state = &state->arch; + struct dfltcc_param_v0 *param = &dfltcc_state->common.param; + uInt masked_avail_in; + dfltcc_cc cc; + int need_empty_block; + int soft_bcc; + int no_flush; + + if (!PREFIX(dfltcc_can_deflate)(strm)) { + /* Clear history. */ + if (flush == Z_FULL_FLUSH) + param->hl = 0; + return 0; + } + +again: + masked_avail_in = 0; + soft_bcc = 0; + no_flush = flush == Z_NO_FLUSH; + + /* No input data. Return, except when Continuation Flag is set, which means + * that DFLTCC has buffered some output in the parameter block and needs to + * be called again in order to flush it. + */ + if (strm->avail_in == 0 && !param->cf) { + /* A block is still open, and the hardware does not support closing + * blocks without adding data. Thus, close it manually. + */ + if (!no_flush && param->bcf) { + send_eobs(strm, param); + param->bcf = 0; + } + /* Let one of deflate_* functions write a trailing empty block. */ + if (flush == Z_FINISH) + return 0; + /* Clear history. */ + if (flush == Z_FULL_FLUSH) + param->hl = 0; + /* Trigger block post-processing if necessary. */ + *result = no_flush ? need_more : block_done; + return 1; + } + + /* There is an open non-BFINAL block, we are not going to close it just + * yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see + * more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new + * DHT in order to adapt to a possibly changed input data distribution. + */ + if (param->bcf && no_flush && + strm->total_in > dfltcc_state->block_threshold && + strm->avail_in >= dfltcc_state->dht_threshold) { + if (param->cf) { + /* We need to flush the DFLTCC buffer before writing the + * End-of-block Symbol. Mask the input data and proceed as usual. + */ + masked_avail_in += strm->avail_in; + strm->avail_in = 0; + no_flush = 0; + } else { + /* DFLTCC buffer is empty, so we can manually write the + * End-of-block Symbol right away. + */ + send_eobs(strm, param); + param->bcf = 0; + dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size; + } + } + + /* No space for compressed data. If we proceed, dfltcc_cmpr() will return + * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still + * set BCF=1, which is wrong. Avoid complications and return early. + */ + if (strm->avail_out == 0) { + *result = need_more; + return 1; + } + + /* The caller gave us too much data. Pass only one block worth of + * uncompressed data to DFLTCC and mask the rest, so that on the next + * iteration we start a new block. + */ + if (no_flush && strm->avail_in > dfltcc_state->block_size) { + masked_avail_in += (strm->avail_in - dfltcc_state->block_size); + strm->avail_in = dfltcc_state->block_size; + } + + /* When we have an open non-BFINAL deflate block and caller indicates that + * the stream is ending, we need to close an open deflate block and open a + * BFINAL one. + */ + need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf; + + /* Translate stream to parameter block */ + param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32; + if (!no_flush) + /* We need to close a block. 
Always do this in software - when there is + * no input data, the hardware will not honor BCC. */ + soft_bcc = 1; + if (flush == Z_FINISH && !param->bcf) + /* We are about to open a BFINAL block, set Block Header Final bit + * until the stream ends. + */ + param->bhf = 1; + /* DFLTCC-CMPR will write to next_out, so make sure that buffers with + * higher precedence are empty. + */ + Assert(state->pending == 0, "There must be no pending bytes"); + Assert(state->bi_valid < 8, "There must be less than 8 pending bits"); + param->sbb = (unsigned int)state->bi_valid; + if (param->sbb > 0) + *strm->next_out = (unsigned char)state->bi_buf; + /* Honor history and check value */ + param->nt = 0; + if (state->wrap == 1) + param->cv = strm->adler; + else if (state->wrap == 2) + param->cv = ZSWAP32(state->crc_fold.value); + + /* When opening a block, choose a Huffman-Table Type */ + if (!param->bcf) { + if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0)) + param->htt = HTT_FIXED; + else { + param->htt = HTT_DYNAMIC; + dfltcc_gdht(strm); + } + } + + /* Deflate */ + do { + cc = dfltcc_cmpr(strm); + if (strm->avail_in < 4096 && masked_avail_in > 0) + /* We are about to call DFLTCC with a small input buffer, which is + * inefficient. Since there is masked data, there will be at least + * one more DFLTCC call, so skip the current one and make the next + * one handle more data. + */ + break; + } while (cc == DFLTCC_CC_AGAIN); + + /* Translate parameter block to stream */ + strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc); + state->bi_valid = param->sbb; + if (state->bi_valid == 0) + state->bi_buf = 0; /* Avoid accessing next_out */ + else + state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1); + if (state->wrap == 1) + strm->adler = param->cv; + else if (state->wrap == 2) + state->crc_fold.value = ZSWAP32(param->cv); + + /* Unmask the input data */ + strm->avail_in += masked_avail_in; + masked_avail_in = 0; + + /* If we encounter an error, it means there is a bug in DFLTCC call */ + Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG"); + + /* Update Block-Continuation Flag. It will be used to check whether to call + * GDHT the next time. + */ + if (cc == DFLTCC_CC_OK) { + if (soft_bcc) { + send_eobs(strm, param); + param->bcf = 0; + dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size; + } else + param->bcf = 1; + if (flush == Z_FINISH) { + if (need_empty_block) + /* Make the current deflate() call also close the stream */ + return 0; + else { + bi_windup(state); + *result = finish_done; + } + } else { + if (flush == Z_FULL_FLUSH) + param->hl = 0; /* Clear history */ + *result = flush == Z_NO_FLUSH ? need_more : block_done; + } + } else { + param->bcf = 1; + *result = need_more; + } + if (strm->avail_in != 0 && strm->avail_out != 0) + goto again; /* deflate() must use all input or all output */ + return 1; +} + +/* + Switching between hardware and software compression. + + DFLTCC does not support all zlib settings, e.g. generation of non-compressed + blocks or alternative window sizes. When such settings are applied on the + fly with deflateParams, we need to convert between hardware and software + window formats. 
+*/ +static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + return strm->total_in > 0 || param->nt == 0 || param->hl > 0; +} + +int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) { + deflate_state *state = (deflate_state *)strm->state; + int could_deflate = PREFIX(dfltcc_can_deflate)(strm); + int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible); + + if (can_deflate == could_deflate) + /* We continue to work in the same mode - no changes needed */ + return Z_OK; + + if (!dfltcc_was_deflate_used(strm)) + /* DFLTCC was not used yet - no changes needed */ + return Z_OK; + + /* For now, do not convert between window formats - simply get rid of the old data instead */ + *flush = Z_FULL_FLUSH; + return Z_OK; +} + +int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + /* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might + * close the block without resetting the compression state. Detect this + * situation and return that deflation is not done. + */ + if (flush == Z_FULL_FLUSH && strm->avail_out == 0) + return 0; + + /* Return that deflation is not done if DFLTCC is used and either it + * buffered some data (Continuation Flag is set), or has not written EOBS + * yet (Block-Continuation Flag is set). + */ + return !PREFIX(dfltcc_can_deflate)(strm) || (!param->cf && !param->bcf); +} + +int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) { + deflate_state *state = (deflate_state *)strm->state; + + return reproducible != state->reproducible && !dfltcc_was_deflate_used(strm); +} + +/* + Preloading history. 
+*/ +int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + append_history(param, state->window, dictionary, dict_length); + state->strstart = 1; /* Add FDICT to zlib header */ + state->block_start = state->strstart; /* Make deflate_stored happy */ + return Z_OK; +} + +int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + if (dictionary) + get_history(param, state->window, dictionary); + if (dict_length) + *dict_length = param->hl; + return Z_OK; +} diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h new file mode 100644 index 0000000000..35e2fd3f62 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h @@ -0,0 +1,58 @@ +#ifndef DFLTCC_DEFLATE_H +#define DFLTCC_DEFLATE_H + +#include "deflate.h" +#include "dfltcc_common.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp)); +int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result); +int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush); +int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush); +int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible); +int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length); +int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length); + +#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \ + do { \ + if (PREFIX(dfltcc_can_deflate)((strm))) \ + return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \ + } while (0) + +#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \ + do { \ + if (PREFIX(dfltcc_can_deflate)((strm))) \ + return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \ + } while (0) + +#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state) + +#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \ + do { \ + int err; \ +\ + err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \ + if (err == Z_STREAM_ERROR) \ + return err; \ + } while (0) + +#define DEFLATE_DONE PREFIX(dfltcc_deflate_done) + +#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \ + do { \ + if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \ + (complen) = DEFLATE_BOUND_COMPLEN(source_len); \ + } while (0) + +#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm))) + +#define DEFLATE_HOOK PREFIX(dfltcc_deflate) + +#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm))) + +#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible) + +#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE) + +#endif diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h b/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h new file mode 100644 index 0000000000..ae6001ba38 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h @@ -0,0 +1,275 @@ +#include "zbuild.h" 
+#include + +#ifdef HAVE_SYS_SDT_H +#include +#endif + +/* + Tuning parameters. + */ +#ifndef DFLTCC_LEVEL_MASK +#define DFLTCC_LEVEL_MASK 0x2 +#endif +#ifndef DFLTCC_BLOCK_SIZE +#define DFLTCC_BLOCK_SIZE 1048576 +#endif +#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE +#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096 +#endif +#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE +#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096 +#endif +#ifndef DFLTCC_RIBM +#define DFLTCC_RIBM 0 +#endif + +#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1] + +#define DFLTCC_SIZEOF_QAF 32 +static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf); + +static inline int is_bit_set(const char *bits, int n) { + return bits[n / 8] & (1 << (7 - (n % 8))); +} + +static inline void clear_bit(char *bits, int n) { + bits[n / 8] &= ~(1 << (7 - (n % 8))); +} + +#define DFLTCC_FACILITY 151 + +static inline int is_dfltcc_enabled(void) { + uint64_t facilities[(DFLTCC_FACILITY / 64) + 1]; + Z_REGISTER uint8_t r0 __asm__("r0"); + + memset(facilities, 0, sizeof(facilities)); + r0 = sizeof(facilities) / sizeof(facilities[0]) - 1; + /* STFLE is supported since z9-109 and only in z/Architecture mode. When + * compiling with -m31, gcc defaults to ESA mode, however, since the kernel + * is 64-bit, it's always z/Architecture mode at runtime. + */ + __asm__ volatile( +#ifndef __clang__ + ".machinemode push\n" + ".machinemode zarch\n" +#endif + "stfle %[facilities]\n" +#ifndef __clang__ + ".machinemode pop\n" +#endif + : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc"); + return is_bit_set((const char *)facilities, DFLTCC_FACILITY); +} + +#define DFLTCC_FMT0 0 + +#define CVT_CRC32 0 +#define CVT_ADLER32 1 +#define HTT_FIXED 0 +#define HTT_DYNAMIC 1 + +#define DFLTCC_SIZEOF_GDHT_V0 384 +#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536 +static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0); +static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0); + +static inline z_const char *oesc_msg(char *buf, int oesc) { + if (oesc == 0x00) + return NULL; /* Successful completion */ + else { + sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc); + return buf; + } +} + +/* + C wrapper for the DEFLATE CONVERSION CALL instruction. + */ +typedef enum { + DFLTCC_CC_OK = 0, + DFLTCC_CC_OP1_TOO_SHORT = 1, + DFLTCC_CC_OP2_TOO_SHORT = 2, + DFLTCC_CC_OP2_CORRUPT = 2, + DFLTCC_CC_AGAIN = 3, +} dfltcc_cc; + +#define DFLTCC_QAF 0 +#define DFLTCC_GDHT 1 +#define DFLTCC_CMPR 2 +#define DFLTCC_XPND 4 +#define HBT_CIRCULAR (1 << 7) +#define DFLTCC_FN_MASK ((1 << 7) - 1) + +/* Return lengths of high (starting at param->ho) and low (starting at 0) fragments of the circular history buffer. */ +static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) { + *hl_high = MIN(param->hl, HB_SIZE - param->ho); + *hl_low = param->hl - *hl_high; +} + +/* Notify instrumentation about an upcoming read/write access to the circular history buffer. */ +static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) { + size_t hl_high, hl_low; + + get_history_lengths(param, &hl_high, &hl_low); + instrument_read_write(hist + param->ho, hl_high); + instrument_read_write(hist, hl_low); +} + +/* Notify MSan about a completed write to the circular history buffer. 
*/ +static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) { + size_t hl_high, hl_low; + + get_history_lengths(param, &hl_high, &hl_low); + __msan_unpoison(hist + param->ho, hl_high); + __msan_unpoison(hist, hl_low); +} + +static inline dfltcc_cc dfltcc(int fn, void *param, + unsigned char **op1, size_t *len1, + z_const unsigned char **op2, size_t *len2, void *hist) { + unsigned char *t2 = op1 ? *op1 : NULL; + unsigned char *orig_t2 = t2; + size_t t3 = len1 ? *len1 : 0; + z_const unsigned char *t4 = op2 ? *op2 : NULL; + size_t t5 = len2 ? *len2 : 0; + Z_REGISTER int r0 __asm__("r0"); + Z_REGISTER void *r1 __asm__("r1"); + Z_REGISTER unsigned char *r2 __asm__("r2"); + Z_REGISTER size_t r3 __asm__("r3"); + Z_REGISTER z_const unsigned char *r4 __asm__("r4"); + Z_REGISTER size_t r5 __asm__("r5"); + int cc; + + /* Insert pre-instrumentation for DFLTCC. */ + switch (fn & DFLTCC_FN_MASK) { + case DFLTCC_QAF: + instrument_write(param, DFLTCC_SIZEOF_QAF); + break; + case DFLTCC_GDHT: + instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0); + instrument_read(t4, t5); + break; + case DFLTCC_CMPR: + case DFLTCC_XPND: + instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0); + instrument_read(t4, t5); + instrument_write(t2, t3); + instrument_read_write_hist(param, hist); + break; + } + + r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5; + __asm__ volatile( +#ifdef HAVE_SYS_SDT_H + STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5)) +#endif + ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n" +#ifdef HAVE_SYS_SDT_H + STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5)) +#endif + "ipm %[cc]\n" + : [r2] "+r" (r2) + , [r3] "+r" (r3) + , [r4] "+r" (r4) + , [r5] "+r" (r5) + , [cc] "=r" (cc) + : [r0] "r" (r0) + , [r1] "r" (r1) + , [hist] "r" (hist) +#ifdef HAVE_SYS_SDT_H + , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist) +#endif + : "cc", "memory"); + t2 = r2; t3 = r3; t4 = r4; t5 = r5; + + /* Insert post-instrumentation for DFLTCC. */ + switch (fn & DFLTCC_FN_MASK) { + case DFLTCC_QAF: + __msan_unpoison(param, DFLTCC_SIZEOF_QAF); + break; + case DFLTCC_GDHT: + __msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0); + break; + case DFLTCC_CMPR: + __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0); + __msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 
0 : 1)); + msan_unpoison_hist(param, hist); + break; + case DFLTCC_XPND: + __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0); + __msan_unpoison(orig_t2, t2 - orig_t2); + msan_unpoison_hist(param, hist); + break; + } + + if (op1) + *op1 = t2; + if (len1) + *len1 = t3; + if (op2) + *op2 = t4; + if (len2) + *len2 = t5; + return (cc >> 28) & 3; +} + +#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1)) + +static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) { + /* Initialize available functions */ + if (is_dfltcc_enabled()) { + dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL); + memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af)); + } else + memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af)); + + /* Initialize parameter block */ + memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param)); + dfltcc_state->param.nt = 1; + dfltcc_state->param.ribm = DFLTCC_RIBM; +} + +static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) { + memcpy(dst, src, ALIGN_UP(size, 8) + extension_size); +} + +static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history, + const unsigned char *buf, uInt count) { + size_t offset; + size_t n; + + /* Do not use more than 32K */ + if (count > HB_SIZE) { + buf += count - HB_SIZE; + count = HB_SIZE; + } + offset = (param->ho + param->hl) % HB_SIZE; + if (offset + count <= HB_SIZE) + /* Circular history buffer does not wrap - copy one chunk */ + memcpy(history + offset, buf, count); + else { + /* Circular history buffer wraps - copy two chunks */ + n = HB_SIZE - offset; + memcpy(history + offset, buf, n); + memcpy(history, buf + n, count - n); + } + n = param->hl + count; + if (n <= HB_SIZE) + /* All history fits into buffer - no need to discard anything */ + param->hl = n; + else { + /* History does not fit into buffer - discard extra bytes */ + param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE; + param->hl = HB_SIZE; + } +} + +static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history, + unsigned char *buf) { + size_t hl_high, hl_low; + + get_history_lengths(param, &hl_high, &hl_low); + memcpy(buf, history + param->ho, hl_high); + memcpy(buf + hl_high, history, hl_low); +} diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c new file mode 100644 index 0000000000..cc3cb39781 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c @@ -0,0 +1,191 @@ +/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */ + +/* + Use the following commands to build zlib-ng with DFLTCC decompression support: + + $ ./configure --with-dfltcc-inflate + or + + $ cmake -DWITH_DFLTCC_INFLATE=1 . 
+ + and then + + $ make +*/ + +#include "zbuild.h" +#include "zutil.h" +#include "inftrees.h" +#include "inflate.h" +#include "dfltcc_inflate.h" +#include "dfltcc_detail.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + + dfltcc_reset_state(&state->arch.common); +} + +int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = &state->arch.common; + + /* Unsupported hardware */ + return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0); +} + +static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + size_t avail_in = strm->avail_in; + size_t avail_out = strm->avail_out; + dfltcc_cc cc; + + cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR, + param, &strm->next_out, &avail_out, + &strm->next_in, &avail_in, state->window); + strm->avail_in = avail_in; + strm->avail_out = avail_out; + return cc; +} + +dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = &state->arch.common; + struct dfltcc_param_v0 *param = &dfltcc_state->param; + dfltcc_cc cc; + + if (flush == Z_BLOCK || flush == Z_TREES) { + /* DFLTCC does not support stopping on block boundaries */ + if (PREFIX(dfltcc_inflate_disable)(strm)) { + *ret = Z_STREAM_ERROR; + return DFLTCC_INFLATE_BREAK; + } else + return DFLTCC_INFLATE_SOFTWARE; + } + + if (state->last) { + if (state->bits != 0) { + strm->next_in++; + strm->avail_in--; + state->bits = 0; + } + state->mode = CHECK; + return DFLTCC_INFLATE_CONTINUE; + } + + if (strm->avail_in == 0 && !param->cf) + return DFLTCC_INFLATE_BREAK; + + /* if window not in use yet, initialize */ + if (state->wsize == 0) + state->wsize = 1U << state->wbits; + + /* Translate stream to parameter block */ + param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32; + param->sbb = state->bits; + if (param->hl) + param->nt = 0; /* Honor history for the first block */ + if (state->wrap & 4) + param->cv = state->flags ? ZSWAP32(state->check) : state->check; + + /* Inflate */ + do { + cc = dfltcc_xpnd(strm); + } while (cc == DFLTCC_CC_AGAIN); + + /* Translate parameter block to stream */ + strm->msg = oesc_msg(dfltcc_state->msg, param->oesc); + state->last = cc == DFLTCC_CC_OK; + state->bits = param->sbb; + if (state->wrap & 4) + strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv; + if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) { + /* Report an error if stream is corrupted */ + state->mode = BAD; + return DFLTCC_INFLATE_CONTINUE; + } + state->mode = TYPEDO; + /* Break if operands are exhausted, otherwise continue looping */ + return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ? + DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE; +} + +int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + + return !state->arch.common.param.nt; +} + +/* + Rotates a circular buffer. 
+ The implementation is based on https://cplusplus.com/reference/algorithm/rotate/ + */ +static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) { + unsigned char *p = pivot; + unsigned char tmp; + + while (p != start) { + tmp = *start; + *start = *p; + *p = tmp; + + start++; + p++; + + if (p == end) + p = pivot; + else if (start == pivot) + pivot = p; + } +} + +int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = &state->arch.common; + struct dfltcc_param_v0 *param = &dfltcc_state->param; + + if (!PREFIX(dfltcc_can_inflate)(strm)) + return 0; + if (PREFIX(dfltcc_was_inflate_used)(strm)) + /* DFLTCC has already decompressed some data. Since there is not + * enough information to resume decompression in software, the call + * must fail. + */ + return 1; + /* DFLTCC was not used yet - decompress in software */ + memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af)); + /* Convert the window from the hardware to the software format */ + rotate(state->window, state->window + param->ho, state->window + HB_SIZE); + state->whave = state->wnext = MIN(param->hl, state->wsize); + return 0; +} + +/* + Preloading history. +*/ +int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + /* if window not in use yet, initialize */ + if (state->wsize == 0) + state->wsize = 1U << state->wbits; + + append_history(param, state->window, dictionary, dict_length); + state->havedict = 1; + return Z_OK; +} + +int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm, + unsigned char *dictionary, uInt *dict_length) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + if (dictionary && state->window) + get_history(param, state->window, dictionary); + if (dict_length) + *dict_length = param->hl; + return Z_OK; +} diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h new file mode 100644 index 0000000000..3623f8ed7f --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h @@ -0,0 +1,67 @@ +#ifndef DFLTCC_INFLATE_H +#define DFLTCC_INFLATE_H + +#include "dfltcc_common.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm); +typedef enum { + DFLTCC_INFLATE_CONTINUE, + DFLTCC_INFLATE_BREAK, + DFLTCC_INFLATE_SOFTWARE, +} dfltcc_inflate_action; +dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret); +int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length); +int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm, + unsigned char *dictionary, uInt* dict_length); + +#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state) + +#define INFLATE_PRIME_HOOK(strm, bits, value) \ + do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0) + +#define INFLATE_TYPEDO_HOOK(strm, flush) \ + if (PREFIX(dfltcc_can_inflate)((strm))) { \ 
+        dfltcc_inflate_action action; \
+\
+        RESTORE(); \
+        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
+        LOAD(); \
+        if (action == DFLTCC_INFLATE_CONTINUE) \
+            break; \
+        else if (action == DFLTCC_INFLATE_BREAK) \
+            goto inf_leave; \
+    }
+
+#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+#define INFLATE_MARK_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
+    } while (0)
+
+#define INFLATE_SYNC_POINT_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
+    } while (0)
+
+#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/s390/s390_features.c b/3rdparty/zlib-ng/arch/s390/s390_features.c
new file mode 100644
index 0000000000..629025d5bb
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/s390/s390_features.c
@@ -0,0 +1,14 @@
+#include "zbuild.h"
+#include "s390_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+#  include <sys/auxv.h>
+#endif
+
+#ifndef HWCAP_S390_VXRS
+#define HWCAP_S390_VXRS HWCAP_S390_VX
+#endif
+
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
+    features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
+}
diff --git a/3rdparty/zlib-ng/arch/s390/s390_features.h b/3rdparty/zlib-ng/arch/s390/s390_features.h
new file mode 100644
index 0000000000..fb2ac14b26
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/s390/s390_features.h
@@ -0,0 +1,14 @@
+/* s390_features.h -- check for s390 features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FEATURES_H_
+#define S390_FEATURES_H_
+
+struct s390_cpu_features {
+    int has_vx;
+};
+
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/s390/s390_functions.h b/3rdparty/zlib-ng/arch/s390/s390_functions.h
new file mode 100644
index 0000000000..e9c67978f0
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/s390/s390_functions.h
@@ -0,0 +1,20 @@
+/* s390_functions.h -- s390 implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FUNCTIONS_H_
+#define S390_FUNCTIONS_H_
+
+#ifdef S390_CRC32_VX
+uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
+#endif
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+# if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
+# undef native_crc32
+# define native_crc32 crc32_s390_vx
+# endif
+#endif
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
new file mode 100644
index 0000000000..cf5c3e7271
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
@@ -0,0 +1,47 @@
+# Self-Hosted IBM Z Github Actions Runner.
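Before the runner container definition continues below, note that the s390_features pair above is the entire runtime-detection surface for the vector CRC-32: a single HWCAP bit. A sketch of how a dispatcher might consume it (the selector and fallback names here are illustrative, not zlib-ng's actual dispatch table):

    /* Sketch: runtime selection keyed off s390_check_features().
     * crc32_portable stands in for the generic braid implementation;
     * select_crc32() is illustrative, not zlib-ng's real dispatcher. */
    #include <stddef.h>
    #include <stdint.h>
    #include "zbuild.h"
    #include "s390_features.h"

    uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);  /* built when S390_CRC32_VX */
    uint32_t crc32_portable(uint32_t crc, const uint8_t *buf, size_t len); /* hypothetical fallback */

    typedef uint32_t (*crc32_func)(uint32_t, const uint8_t *, size_t);

    static crc32_func select_crc32(void) {
        struct s390_cpu_features features;
        /* Sets has_vx from getauxval(AT_HWCAP) & HWCAP_S390_VXRS. */
        s390_check_features(&features);
        return features.has_vx ? crc32_s390_vx : crc32_portable;
    }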
+ +FROM almalinux:9 + +RUN dnf update -y -q && \ + dnf install -y -q --enablerepo=crb wget git which sudo jq \ + cmake make automake autoconf m4 libtool ninja-build python3-pip \ + gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \ + glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel + +RUN dnf install -y -q dotnet-sdk-6.0 && \ + echo "Using SDK - `dotnet --version`" + +COPY runner-s390x.patch /tmp/runner.patch +COPY runner-global.json /tmp/global.json + +RUN cd /tmp && \ + git clone -q https://github.com/actions/runner && \ + cd runner && \ + git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -b build && \ + git apply /tmp/runner.patch && \ + cp -f /tmp/global.json src/global.json + + +RUN cd /tmp/runner/src && \ + ./dev.sh layout && \ + ./dev.sh package && \ + rm -rf /root/.dotnet /root/.nuget + +RUN useradd -c "Action Runner" -m actions-runner && \ + usermod -L actions-runner + +RUN tar -xf /tmp/runner/_package/*.tar.gz -C /home/actions-runner && \ + chown -R actions-runner:actions-runner /home/actions-runner + +#VOLUME /home/actions-runner + +RUN rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \ + dnf clean all + +USER actions-runner + +# Scripts. +COPY fs/ / +WORKDIR /home/actions-runner +ENTRYPOINT ["/usr/bin/entrypoint"] +CMD ["/usr/bin/actions-runner"] diff --git a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service new file mode 100644 index 0000000000..b6c20b65ec --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service @@ -0,0 +1,18 @@ +[Unit] +Description=Podman container: Gaplib Github Actions Runner +Wants=network-online.target +After=network-online.target +StartLimitIntervalSec=1 +RequiresMountsFor=/run/user/1001/containers + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Restart=always +TimeoutStopSec=61 +ExecStart=/usr/bin/podman start gaplib-actions-runner +ExecStop=/usr/bin/podman stop -t 1 gaplib-actions-runner +ExecStopPost=/usr/bin/podman stop -t 1 gaplib-actions-runner +Type=forking + +[Install] +WantedBy=default.target diff --git a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json new file mode 100644 index 0000000000..e7028fe0dd --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json @@ -0,0 +1,5 @@ +{ + "sdk": { + "version": "6.0.421" + } +} diff --git a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch new file mode 100644 index 0000000000..8260f3ccdd --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch @@ -0,0 +1,243 @@ +diff --git a/src/Directory.Build.props b/src/Directory.Build.props +index 9db5fac..f02e235 100644 +--- a/src/Directory.Build.props ++++ b/src/Directory.Build.props +@@ -44,6 +44,9 @@ + + $(DefineConstants);ARM64 + ++ ++ $(DefineConstants);S390X ++ + + + +diff --git a/src/Misc/externals.sh b/src/Misc/externals.sh +index 383221e..1555f67 100755 +--- a/src/Misc/externals.sh ++++ b/src/Misc/externals.sh +@@ -189,3 +189,8 @@ if [[ "$PACKAGERUNTIME" == "linux-arm" ]]; then + acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-armv7l.tar.gz" node16 fix_nested_dir + acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-armv7l.tar.gz" node20 fix_nested_dir + fi ++ ++if 
[[ "$PACKAGERUNTIME" == "linux-s390x" ]]; then ++ acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-s390x.tar.gz" node16 fix_nested_dir ++ acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-s390x.tar.gz" node20 fix_nested_dir ++fi +diff --git a/src/Misc/layoutroot/config.sh b/src/Misc/layoutroot/config.sh +index 14cc6ba..9b5b8e6 100755 +--- a/src/Misc/layoutroot/config.sh ++++ b/src/Misc/layoutroot/config.sh +@@ -20,25 +20,29 @@ then + + message="Execute sudo ./bin/installdependencies.sh to install any missing Dotnet Core 6.0 dependencies." + +- ldd ./bin/libcoreclr.so | grep 'not found' +- if [ $? -eq 0 ]; then +- echo "Dependencies is missing for Dotnet Core 6.0" +- echo $message +- exit 1 +- fi ++ ARCH=`uname -m` ++ if [ "${ARCH}" != "s390x" -a "${ARCH}" != "ppc64le" ] ++ then ++ ldd ./bin/libcoreclr.so | grep 'not found' ++ if [ $? -eq 0 ]; then ++ echo "Dependencies is missing for Dotnet Core 6.0" ++ echo $message ++ exit 1 ++ fi + +- ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found' +- if [ $? -eq 0 ]; then +- echo "Dependencies is missing for Dotnet Core 6.0" +- echo $message +- exit 1 +- fi ++ ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found' ++ if [ $? -eq 0 ]; then ++ echo "Dependencies is missing for Dotnet Core 6.0" ++ echo $message ++ exit 1 ++ fi + +- ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found' +- if [ $? -eq 0 ]; then +- echo "Dependencies is missing for Dotnet Core 6.0" +- echo $message +- exit 1 ++ ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found' ++ if [ $? -eq 0 ]; then ++ echo "Dependencies is missing for Dotnet Core 6.0" ++ echo $message ++ exit 1 ++ fi + fi + + if ! [ -x "$(command -v ldconfig)" ]; then +diff --git a/src/Runner.Common/Constants.cs b/src/Runner.Common/Constants.cs +index 177e3c9..9545981 100644 +--- a/src/Runner.Common/Constants.cs ++++ b/src/Runner.Common/Constants.cs +@@ -58,7 +58,8 @@ namespace GitHub.Runner.Common + X86, + X64, + Arm, +- Arm64 ++ Arm64, ++ S390x + } + + public static class Runner +@@ -81,6 +82,8 @@ namespace GitHub.Runner.Common + public static readonly Architecture PlatformArchitecture = Architecture.Arm; + #elif ARM64 + public static readonly Architecture PlatformArchitecture = Architecture.Arm64; ++#elif S390X ++ public static readonly Architecture PlatformArchitecture = Architecture.S390x; + #else + public static readonly Architecture PlatformArchitecture = Architecture.X64; + #endif +diff --git a/src/Runner.Common/Util/VarUtil.cs b/src/Runner.Common/Util/VarUtil.cs +index 97273a1..2a34430 100644 +--- a/src/Runner.Common/Util/VarUtil.cs ++++ b/src/Runner.Common/Util/VarUtil.cs +@@ -53,6 +53,8 @@ namespace GitHub.Runner.Common.Util + return "ARM"; + case Constants.Architecture.Arm64: + return "ARM64"; ++ case Constants.Architecture.S390x: ++ return "S390X"; + default: + throw new NotSupportedException(); // Should never reach here. 
+ } +diff --git a/src/Test/L0/ConstantGenerationL0.cs b/src/Test/L0/ConstantGenerationL0.cs +index 2042485..a9d8b46 100644 +--- a/src/Test/L0/ConstantGenerationL0.cs ++++ b/src/Test/L0/ConstantGenerationL0.cs +@@ -20,6 +20,7 @@ namespace GitHub.Runner.Common.Tests + "linux-x64", + "linux-arm", + "linux-arm64", ++ "linux-s390x", + "osx-x64", + "osx-arm64" + }; +diff --git a/src/Test/L0/Listener/SelfUpdaterL0.cs b/src/Test/L0/Listener/SelfUpdaterL0.cs +index 26ba65e..6791df3 100644 +--- a/src/Test/L0/Listener/SelfUpdaterL0.cs ++++ b/src/Test/L0/Listener/SelfUpdaterL0.cs +@@ -1,4 +1,4 @@ +-#if !(OS_WINDOWS && ARM64) ++#if !(OS_WINDOWS && ARM64) && !S390X + using System; + using System.Collections.Generic; + using System.IO; +@@ -16,6 +16,7 @@ using Xunit; + + namespace GitHub.Runner.Common.Tests.Listener + { ++#if !S390X // Self-update is not currently supported on S390X + public sealed class SelfUpdaterL0 + { + private Mock _runnerServer; +@@ -291,5 +292,6 @@ namespace GitHub.Runner.Common.Tests.Listener + } + } + } ++#endif + } + #endif +diff --git a/src/Test/L0/Listener/SelfUpdaterV2L0.cs b/src/Test/L0/Listener/SelfUpdaterV2L0.cs +index 5115a6b..dd8d198 100644 +--- a/src/Test/L0/Listener/SelfUpdaterV2L0.cs ++++ b/src/Test/L0/Listener/SelfUpdaterV2L0.cs +@@ -1,4 +1,4 @@ +-#if !(OS_WINDOWS && ARM64) ++#if !(OS_WINDOWS && ARM64) && !S390X + using System; + using System.Collections.Generic; + using System.IO; +diff --git a/src/Test/L0/Worker/StepHostL0.cs b/src/Test/L0/Worker/StepHostL0.cs +index f6b5889..26f8e21 100644 +--- a/src/Test/L0/Worker/StepHostL0.cs ++++ b/src/Test/L0/Worker/StepHostL0.cs +@@ -31,7 +31,7 @@ namespace GitHub.Runner.Common.Tests.Worker + return hc; + } + +-#if OS_LINUX ++#if OS_LINUX && !S390X + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Worker")] +diff --git a/src/dev.sh b/src/dev.sh +index fa637d1..8c66f37 100755 +--- a/src/dev.sh ++++ b/src/dev.sh +@@ -54,6 +54,7 @@ elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then + case $CPU_NAME in + armv7l) RUNTIME_ID="linux-arm";; + aarch64) RUNTIME_ID="linux-arm64";; ++ s390x) RUNTIME_ID="linux-s390x";; + esac + fi + elif [[ "$CURRENT_PLATFORM" == 'darwin' ]]; then +@@ -80,7 +81,7 @@ if [[ "$CURRENT_PLATFORM" == 'windows' ]]; then + exit 1 + fi + elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then +- if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') ]]; then ++ if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') && ("$RUNTIME_ID" != 'linux-s390x') ]]; then + echo "Failed: Can't build $RUNTIME_ID package $CURRENT_PLATFORM" >&2 + exit 1 + fi +@@ -199,7 +200,8 @@ function package () + popd > /dev/null + } + +-if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet") ]]; then ++if [[ "${RUNTIME_ID}" != "linux-s390x" && ((! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet")) ]]; then ++ + + # Download dotnet SDK to ../_dotnetsdk directory + heading "Ensure Dotnet SDK" +@@ -224,8 +226,10 @@ if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! 
-e "${DOTNETSDK_INSTALLDIR}/.${DOTN + echo "${DOTNETSDK_VERSION}" > "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}" + fi + +-echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%" +-export PATH=${DOTNETSDK_INSTALLDIR}:$PATH ++if [[ -d "${DOTNETSDK_INSTALLDIR}" ]]; then ++ echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%" ++ export PATH=${DOTNETSDK_INSTALLDIR}:$PATH ++fi + + heading "Dotnet SDK Version" + dotnet --version +diff --git a/src/dir.proj b/src/dir.proj +index 056a312..8370922 100644 +--- a/src/dir.proj ++++ b/src/dir.proj +@@ -41,8 +41,18 @@ + + + +- +- ++ ++ ++ RuntimeIdentifier=$(PackageRuntime) ++ ++ SelfContained=false;CopyLocalRuntimeTargetAssets=false ++ ++ ++ ++ + + + diff --git a/3rdparty/zlib-ng/arch/x86/Makefile.in b/3rdparty/zlib-ng/arch/x86/Makefile.in index 7c052469b2..c13cd179c0 100644 --- a/3rdparty/zlib-ng/arch/x86/Makefile.in +++ b/3rdparty/zlib-ng/arch/x86/Makefile.in @@ -35,7 +35,6 @@ all: \ chunkset_ssse3.o chunkset_ssse3.lo \ compare256_avx2.o compare256_avx2.lo \ compare256_sse2.o compare256_sse2.lo \ - insert_string_sse42.o insert_string_sse42.lo \ crc32_pclmulqdq.o crc32_pclmulqdq.lo \ crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \ slide_hash_avx2.o slide_hash_avx2.lo \ @@ -77,12 +76,6 @@ compare256_sse2.o: compare256_sse2.lo: $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c -insert_string_sse42.o: - $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c - -insert_string_sse42.lo: - $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c - crc32_pclmulqdq.o: $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c @@ -90,10 +83,10 @@ crc32_pclmulqdq.lo: $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c crc32_vpclmulqdq.o: - $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c + $(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c crc32_vpclmulqdq.lo: - $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c + $(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c slide_hash_avx2.o: $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx2.c b/3rdparty/zlib-ng/arch/x86/adler32_avx2.c index e3ac6705ce..38e7f068e3 100644 --- a/3rdparty/zlib-ng/arch/x86/adler32_avx2.c +++ b/3rdparty/zlib-ng/arch/x86/adler32_avx2.c @@ -9,24 +9,15 @@ #ifdef X86_AVX2 -#include "../../zbuild.h" +#include "zbuild.h" #include -#include "../../adler32_fold.h" -#include "../../adler32_p.h" +#include "adler32_p.h" #include "adler32_avx2_p.h" #include "x86_intrins.h" -#ifdef X86_SSE42 extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len); -#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d) -#define sub32(a, b, c) adler32_ssse3(a, b, c) -#else -#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1) -#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1) -#endif - static inline uint32_t 
adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
     if (src == NULL) return 1L;
     if (len == 0) return adler;
@@ -44,9 +35,9 @@ rem_peel:
         }
     } else if (len < 32) {
         if (COPY) {
-            return copy_sub32(adler, dst, src, len);
+            return adler32_fold_copy_sse42(adler, dst, src, len);
         } else {
-            return sub32(adler, src, len);
+            return adler32_ssse3(adler, src, len);
         }
     }
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx512.c b/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
index aa6cc17018..626c4807f8 100644
--- a/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
@@ -8,10 +8,9 @@
 
 #ifdef X86_AVX512
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../adler32_fold.h"
-#include "../../cpu_features.h"
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
 #include <immintrin.h>
 #include "x86_intrins.h"
 #include "adler32_avx512_p.h"
@@ -33,13 +32,7 @@ rem_peel:
         _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
     }
 
-#ifdef X86_AVX2
     return adler32_avx2(adler, src, len);
-#elif defined(X86_SSSE3)
-    return adler32_ssse3(adler, src, len);
-#else
-    return adler32_len_16(adler0, src, len, adler1);
-#endif
 }
 
     __m512i vbuf, vs1_0, vs3;
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c b/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
index 771f7ebe04..4c5cfc1cad 100644
--- a/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
@@ -9,11 +9,10 @@
 
 #ifdef X86_AVX512VNNI
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../cpu_features.h"
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
 #include <immintrin.h>
-#include "../../adler32_fold.h"
 #include "x86_intrins.h"
 #include "adler32_avx512_p.h"
 #include "adler32_avx2_p.h"
@@ -28,20 +27,10 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size
 
 rem_peel:
     if (len < 32)
-#if defined(X86_SSSE3)
         return adler32_ssse3(adler, src, len);
-#else
-        return adler32_len_16(adler0, src, len, adler1);
-#endif
 
     if (len < 64)
-#ifdef X86_AVX2
         return adler32_avx2(adler, src, len);
-#elif defined(X86_SSE3)
-        return adler32_ssse3(adler, src, len);
-#else
-        return adler32_len_16(adler0, src, len, adler1);
-#endif
 
     const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
@@ -135,11 +124,7 @@ rem_peel_copy:
     __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
     _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
 
-#if defined(X86_SSSE3)
     return adler32_ssse3(adler, src, len);
-#else
-    return adler32_len_16(adler0, src, len, adler1);
-#endif
 }
 
     const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_sse42.c b/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
index 257a360982..df0739d165 100644
--- a/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
@@ -6,9 +6,8 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../adler32_fold.h"
+#include "zbuild.h"
+#include "adler32_p.h"
 #include "adler32_ssse3_p.h"
 #include <immintrin.h>
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c b/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
index ae819d632e..15e2f78ba3 100644
--- a/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
+++
b/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c @@ -6,8 +6,8 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" -#include "../../adler32_p.h" +#include "zbuild.h" +#include "adler32_p.h" #include "adler32_ssse3_p.h" #ifdef X86_SSSE3 diff --git a/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c b/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c index c06d1b37bd..722ecd3d51 100644 --- a/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c +++ b/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c @@ -4,10 +4,7 @@ #include "zbuild.h" -/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize - * code size by sharing the chunkcopy functions, which will certainly compile - * to identical machine code */ -#if defined(X86_SSSE3) && defined(X86_SSE2) +#if defined(X86_SSSE3) #include #include "../generic/chunk_permute_table.h" @@ -19,8 +16,6 @@ typedef __m128i chunk_t; #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 #define HAVE_CHUNK_MAG -#define HAVE_CHUNKCOPY -#define HAVE_CHUNKUNROLL static const lut_rem_pair perm_idx_lut[13] = { {0, 1}, /* 3 */ @@ -83,14 +78,11 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t return ret_vec; } -extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); -extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len); - #define CHUNKSIZE chunksize_ssse3 #define CHUNKMEMSET chunkmemset_ssse3 #define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3 -#define CHUNKCOPY chunkcopy_sse2 -#define CHUNKUNROLL chunkunroll_sse2 +#define CHUNKCOPY chunkcopy_ssse3 +#define CHUNKUNROLL chunkunroll_ssse3 #include "chunkset_tpl.h" diff --git a/3rdparty/zlib-ng/arch/x86/compare256_avx2.c b/3rdparty/zlib-ng/arch/x86/compare256_avx2.c index 1318a0e333..d2c835e4ee 100644 --- a/3rdparty/zlib-ng/arch/x86/compare256_avx2.c +++ b/3rdparty/zlib-ng/arch/x86/compare256_avx2.c @@ -3,8 +3,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" - +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) diff --git a/3rdparty/zlib-ng/arch/x86/compare256_sse2.c b/3rdparty/zlib-ng/arch/x86/compare256_sse2.c index aad4bd240d..216bb3a705 100644 --- a/3rdparty/zlib-ng/arch/x86/compare256_sse2.c +++ b/3rdparty/zlib-ng/arch/x86/compare256_sse2.c @@ -3,8 +3,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" - +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) diff --git a/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h b/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h index 3e79928317..1ffe201dda 100644 --- a/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h +++ b/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h @@ -26,27 +26,26 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3; __m128i xmm_crc_part = _mm_setzero_si128(); -#ifdef COPY char ALIGNED_(16) partial_buf[16] = { 0 }; -#else +#ifndef COPY __m128i xmm_initial = _mm_cvtsi32_si128(init_crc); int32_t first = init_crc != 0; - /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31 - * bytes of input is needed for the aligning load that occurs. 
If there's an initial CRC, to - * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which - * by definition can be up to 15 bytes + one full vector load. */ - assert(len >= 31 || first == 0); + /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed + * for the aligning load that occurs. If there's an initial CRC, to carry it forward through + * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be + * up to 15 bytes + one full vector load. */ + assert(len >= 16 || first == 0); #endif crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); if (len < 16) { -#ifdef COPY if (len == 0) return; memcpy(partial_buf, src, len); xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf); +#ifdef COPY memcpy(dst, partial_buf, len); #endif goto partial; @@ -63,9 +62,23 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint if (algn_diff < 4 && init_crc != 0) { xmm_t0 = xmm_crc_part; - xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1); - fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + if (len >= 32) { + xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1); + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + } else { + memcpy(partial_buf, src + 16, len - 16); + xmm_crc_part = _mm_load_si128((__m128i*)partial_buf); + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + src += 16; + len -= 16; +#ifdef COPY + dst -= algn_diff; +#endif + goto partial; + } + src += 16; len -= 16; } diff --git a/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h b/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h index 05d3b15257..3a4f6af5af 100644 --- a/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h +++ b/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h @@ -17,7 +17,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" +#include "zbuild.h" #include #include @@ -26,8 +26,9 @@ # include #endif -#include "../../crc32_fold.h" -#include "../../crc32_braid_p.h" +#include "crc32.h" +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" #include "x86_intrins.h" #include @@ -350,11 +351,22 @@ Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) { return crc->value; } +static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) { + uint32_t c = (~crc) & 0xffffffff; + + while (len) { + len--; + DO1; + } + + return c ^ 0xffffffff; +} + Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) { - /* For lens < 64, crc32_braid method is faster. The CRC32 instruction for - * these short lengths might also prove to be effective */ - if (len < 64) - return PREFIX(crc32_braid)(crc32, buf, len); + /* For lens smaller than ~12, crc32_small method is faster. 
+ * But there are also minimum requirements for the pclmul functions due to alignment */
+    if (len < 16)
+        return crc32_small(crc32, buf, len);
 
     crc32_fold ALIGNED_(16) crc_state;
     CRC32_FOLD_RESET(&crc_state);
diff --git a/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c b/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
index ec641b4326..cad35b14ee 100644
--- a/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
+++ b/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
@@ -3,7 +3,7 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+#ifdef X86_VPCLMULQDQ_CRC
 #define X86_VPCLMULQDQ
 
 #define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy
diff --git a/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c b/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
deleted file mode 100644
index ae092a7e47..0000000000
--- a/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
- *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- *
- */
-
-#ifdef X86_SSE42
-#include "../../zbuild.h"
-#include <nmmintrin.h>
-#include "../../deflate.h"
-
-#define HASH_CALC(s, h, val)\
-    h = _mm_crc32_u32(h, val)
-
-#define HASH_CALC_VAR h
-#define HASH_CALC_VAR_INIT uint32_t h = 0
-
-#define UPDATE_HASH update_hash_sse42
-#define INSERT_STRING insert_string_sse42
-#define QUICK_INSERT_STRING quick_insert_string_sse42
-
-#include "../../insert_string_tpl.h"
-#endif
diff --git a/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c b/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
index 94fe10c7bf..f49ad3331b 100644
--- a/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
+++ b/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
@@ -9,8 +9,8 @@
  *
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"
 
 #include <immintrin.h>
diff --git a/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c b/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
index 5daac4a739..cfdf7bee49 100644
--- a/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
+++ b/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
@@ -8,8 +8,8 @@
  *
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"
 
 #include <immintrin.h>
 #include <assert.h>
diff --git a/3rdparty/zlib-ng/arch/x86/x86_features.c b/3rdparty/zlib-ng/arch/x86/x86_features.c
index 8d11564c24..58cb4df341 100644
--- a/3rdparty/zlib-ng/arch/x86/x86_features.c
+++ b/3rdparty/zlib-ng/arch/x86/x86_features.c
@@ -7,7 +7,7 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#include "../../zbuild.h"
+#include "zbuild.h"
 #include "x86_features.h"
 
 #ifdef _MSC_VER
@@ -15,6 +15,13 @@
 #else
 // Newer versions of GCC and clang come with cpuid.h
 #  include <cpuid.h>
+#  ifdef X86_HAVE_XSAVE_INTRIN
+#    if __GNUC__ == 8
+#      include <xsaveintrin.h>
+#    else
+#      include <immintrin.h>
+#    endif
+#  endif
 #endif
 
 #include <string.h>
@@ -29,6 +36,7 @@ static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx,
     *ecx = registers[2];
     *edx = registers[3];
 #else
+    *eax = *ebx = *ecx = *edx = 0;
     __cpuid(info, *eax, *ebx, *ecx, *edx);
 #endif
 }
@@ -43,12 +51,13 @@ static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx,
     *ecx = registers[2];
     *edx = registers[3];
 #else
+    *eax = *ebx = *ecx = *edx = 0;
     __cpuid_count(info, subinfo, *eax, *ebx,
*ecx, *edx); #endif } static inline uint64_t xgetbv(unsigned int xcr) { -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN) return _xgetbv(xcr); #else uint32_t eax, edx; @@ -90,7 +99,16 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) { // check AVX512 bits if the OS supports saving ZMM registers if (features->has_os_save_zmm) { - features->has_avx512 = ebx & 0x00010000; + features->has_avx512f = ebx & 0x00010000; + if (features->has_avx512f) { + // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable + // AVX512(DQ,BW,VL). + features->has_avx512dq = ebx & 0x00020000; + features->has_avx512bw = ebx & 0x40000000; + features->has_avx512vl = ebx & 0x80000000; + } + features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \ + && features->has_avx512vl; features->has_avx512vnni = ecx & 0x800; } } diff --git a/3rdparty/zlib-ng/arch/x86/x86_features.h b/3rdparty/zlib-ng/arch/x86/x86_features.h index 4a36bde835..6daa5e3828 100644 --- a/3rdparty/zlib-ng/arch/x86/x86_features.h +++ b/3rdparty/zlib-ng/arch/x86/x86_features.h @@ -1,14 +1,18 @@ /* x86_features.h -- check for CPU features -* Copyright (C) 2013 Intel Corporation Jim Kukunas -* For conditions of distribution and use, see copyright notice in zlib.h -*/ + * Copyright (C) 2013 Intel Corporation Jim Kukunas + * For conditions of distribution and use, see copyright notice in zlib.h + */ #ifndef X86_FEATURES_H_ #define X86_FEATURES_H_ struct x86_cpu_features { int has_avx2; - int has_avx512; + int has_avx512f; + int has_avx512dq; + int has_avx512bw; + int has_avx512vl; + int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled. int has_avx512vnni; int has_sse2; int has_ssse3; @@ -21,4 +25,4 @@ struct x86_cpu_features { void Z_INTERNAL x86_check_features(struct x86_cpu_features *features); -#endif /* CPU_H_ */ +#endif /* X86_FEATURES_H_ */ diff --git a/3rdparty/zlib-ng/arch/x86/x86_functions.h b/3rdparty/zlib-ng/arch/x86/x86_functions.h new file mode 100644 index 0000000000..5aa9b31747 --- /dev/null +++ b/3rdparty/zlib-ng/arch/x86/x86_functions.h @@ -0,0 +1,172 @@ +/* x86_functions.h -- x86 implementations for arch-specific functions. 
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef X86_FUNCTIONS_H_ +#define X86_FUNCTIONS_H_ + +#ifdef X86_SSE2 +uint32_t chunksize_sse2(void); +uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); + +# ifdef HAVE_BUILTIN_CTZ + uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); + uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match); + uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match); + void slide_hash_sse2(deflate_state *s); +# endif + void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start); +#endif + +#ifdef X86_SSSE3 +uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len); +uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left); +void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef X86_SSE42 +uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef X86_AVX2 +uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t chunksize_avx2(void); +uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left); + +# ifdef HAVE_BUILTIN_CTZ + uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); + uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match); + uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match); + void slide_hash_avx2(deflate_state *s); +# endif + void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start); +#endif +#ifdef X86_AVX512 +uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif +#ifdef X86_AVX512VNNI +uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef X86_PCLMULQDQ_CRC +uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc); +void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); +uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc); +uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); +#endif +#ifdef X86_VPCLMULQDQ_CRC +uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc); +void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); +uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc); +uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); +#endif + + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// X86 - SSE2 +# if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2) +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_sse2 +# undef native_chunksize +# define native_chunksize chunksize_sse2 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_sse2 +# undef native_slide_hash +# define native_slide_hash slide_hash_sse2 +# ifdef HAVE_BUILTIN_CTZ +# undef native_compare256 +# define 
native_compare256 compare256_sse2 +# undef native_longest_match +# define native_longest_match longest_match_sse2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_sse2 +# endif +#endif +// X86 - SSSE3 +# if defined(X86_SSSE3) && defined(__SSSE3__) +# undef native_adler32 +# define native_adler32 adler32_ssse3 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_ssse3 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_ssse3 +# endif +// X86 - SSE4.2 +# if defined(X86_SSE42) && defined(__SSE4_2__) +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_sse42 +# endif + +// X86 - PCLMUL +#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__) +# undef native_crc32 +# define native_crc32 crc32_pclmulqdq +# undef native_crc32_fold +# define native_crc32_fold crc32_fold_pclmulqdq +# undef native_crc32_fold_copy +# define native_crc32_fold_copy crc32_fold_pclmulqdq_copy +# undef native_crc32_fold_final +# define native_crc32_fold_final crc32_fold_pclmulqdq_final +# undef native_crc32_fold_reset +# define native_crc32_fold_reset crc32_fold_pclmulqdq_reset +#endif +// X86 - AVX +# if defined(X86_AVX2) && defined(__AVX2__) +# undef native_adler32 +# define native_adler32 adler32_avx2 +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_avx2 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_avx2 +# undef native_chunksize +# define native_chunksize chunksize_avx2 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_avx2 +# undef native_slide_hash +# define native_slide_hash slide_hash_avx2 +# ifdef HAVE_BUILTIN_CTZ +# undef native_compare256 +# define native_compare256 compare256_avx2 +# undef native_longest_match +# define native_longest_match longest_match_avx2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_avx2 +# endif +# endif + +// X86 - AVX512 (F,DQ,BW,Vl) +# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) +# undef native_adler32 +# define native_adler32 adler32_avx512 +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_avx512 +// X86 - AVX512 (VNNI) +# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__) +# undef native_adler32 +# define native_adler32 adler32_avx512_vnni +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_avx512_vnni +# endif +// X86 - VPCLMULQDQ +# if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__) +# undef native_crc32 +# define native_crc32 crc32_vpclmulqdq +# undef native_crc32_fold +# define native_crc32_fold crc32_fold_vpclmulqdq +# undef native_crc32_fold_copy +# define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy +# undef native_crc32_fold_final +# define native_crc32_fold_final crc32_fold_vpclmulqdq_final +# undef native_crc32_fold_reset +# define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset +# endif +# endif +#endif + +#endif /* X86_FUNCTIONS_H_ */ diff --git a/3rdparty/zlib-ng/arch/x86/x86_intrins.h b/3rdparty/zlib-ng/arch/x86/x86_intrins.h index 52e1085d66..0e596d18a1 100644 --- a/3rdparty/zlib-ng/arch/x86/x86_intrins.h +++ b/3rdparty/zlib-ng/arch/x86/x86_intrins.h @@ -7,7 +7,7 @@ #ifdef __AVX2__ #include -#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \ +#if (!defined(__clang__) && !defined(__NVCOMPILER) && 
defined(__GNUC__) && __GNUC__ < 10) \ || (defined(__apple_build_version__) && __apple_build_version__ < 9020039) static inline __m256i _mm256_zextsi128_si256(__m128i a) { __m128i r; @@ -29,7 +29,7 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) { /* GCC <9 is missing some AVX512 intrinsics. */ #ifdef __AVX512F__ -#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9) +#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9) #include #define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \ diff --git a/3rdparty/zlib-ng/arch_functions.h b/3rdparty/zlib-ng/arch_functions.h new file mode 100644 index 0000000000..9a7f8d9379 --- /dev/null +++ b/3rdparty/zlib-ng/arch_functions.h @@ -0,0 +1,29 @@ +/* arch_functions.h -- Arch-specific function prototypes. + * Copyright (C) 2017 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CPU_FUNCTIONS_H_ +#define CPU_FUNCTIONS_H_ + +#include "zbuild.h" +#include "zutil.h" +#include "crc32.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#include "arch/generic/generic_functions.h" + +#if defined(X86_FEATURES) +# include "arch/x86/x86_functions.h" +#elif defined(ARM_FEATURES) +# include "arch/arm/arm_functions.h" +#elif defined(PPC_FEATURES) || defined(POWER_FEATURES) +# include "arch/power/power_functions.h" +#elif defined(S390_FEATURES) +# include "arch/s390/s390_functions.h" +#elif defined(RISCV_FEATURES) +# include "arch/riscv/riscv_functions.h" +#endif + +#endif diff --git a/3rdparty/zlib-ng/chunkset_tpl.h b/3rdparty/zlib-ng/chunkset_tpl.h index f909a12557..f5cc5c0450 100644 --- a/3rdparty/zlib-ng/chunkset_tpl.h +++ b/3rdparty/zlib-ng/chunkset_tpl.h @@ -5,7 +5,7 @@ #include "zbuild.h" #include -#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2) +#if CHUNK_SIZE == 32 && defined(X86_SSSE3) extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len); #endif @@ -25,7 +25,7 @@ Z_INTERNAL uint32_t CHUNKSIZE(void) { without iteration, which will hopefully make the branch prediction more reliable. */ #ifndef HAVE_CHUNKCOPY -Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { +static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { Assert(len > 0, "chunkcopy should never have a length 0"); chunk_t chunk; int32_t align = ((len - 1) % sizeof(chunk_t)) + 1; @@ -54,7 +54,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { least 258 bytes of output space available (258 being the maximum length output from a single token; see inflate_fast()'s assumptions below). 
*/ #ifndef HAVE_CHUNKUNROLL -Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { +static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { unsigned char const *from = out - *dist; chunk_t chunk; while (*dist < *len && *dist < sizeof(chunk_t)) { @@ -98,7 +98,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ Assert(dist > 0, "chunkmemset cannot have a distance 0"); /* Only AVX2 */ -#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2) +#if CHUNK_SIZE == 32 && defined(X86_SSSE3) if (len <= 16) { return chunkmemset_ssse3(out, dist, len); } diff --git a/3rdparty/zlib-ng/cmake/detect-arch.c b/3rdparty/zlib-ng/cmake/detect-arch.c new file mode 100644 index 0000000000..92590182c2 --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-arch.c @@ -0,0 +1,115 @@ +// archdetect.c -- Detect compiler architecture and raise preprocessor error +// containing a simple arch identifier. +// Copyright (C) 2019 Hans Kristian Rosbach +// Licensed under the Zlib license, see LICENSE.md for details + +// x86_64 +#if defined(__x86_64__) || defined(_M_X64) + #error archfound x86_64 + +// x86 +#elif defined(__i386) || defined(_M_IX86) + #error archfound i686 + +// ARM +#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC) + #error archfound aarch64 +#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM) + #if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__) + #error archfound armv8 + #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) + #error archfound armv7 + #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__) + #error archfound armv6 + #elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) + #error archfound armv5 + #elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__) + #error archfound armv4 + #elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__) + #error archfound armv3 + #elif defined(__ARM_ARCH_2__) + #error archfound armv2 + #endif + +// PowerPC +#elif defined(__powerpc__) || defined(_ppc__) || defined(__PPC__) + #if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__) + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #error archfound powerpc64le + #else + #error archfound powerpc64 + #endif + #else + #error archfound powerpc + #endif + +// --------------- Less common architectures alphabetically below --------------- + +// ALPHA +#elif defined(__alpha__) || defined(__alpha) + #error archfound alpha + +// Blackfin +#elif defined(__BFIN__) + #error archfound blackfin + +// Itanium +#elif defined(__ia64) || defined(_M_IA64) + #error archfound ia64 + +// MIPS +#elif defined(__mips__) || defined(__mips) + #error archfound mips + +// Motorola 68000-series +#elif defined(__m68k__) + #error archfound m68k + +// SuperH +#elif defined(__sh__) + #error archfound sh + +// SPARC +#elif defined(__sparc__) || defined(__sparc) + #if defined(__sparcv9) || defined(__sparc_v9__) + #error archfound sparc9 + #elif defined(__sparcv8) || defined(__sparc_v8__) + #error archfound sparc8 + #endif + +// SystemZ +#elif defined(__370__) + #error archfound s370 +#elif defined(__s390__) + #error 
archfound s390 +#elif defined(__s390x) || defined(__zarch__) + #error archfound s390x + +// PARISC +#elif defined(__hppa__) + #error archfound parisc + +// RS-6000 +#elif defined(__THW_RS6000) + #error archfound rs6000 + +// RISC-V +#elif defined(__riscv) + #if __riscv_xlen == 64 + #error archfound riscv64 + #elif __riscv_xlen == 32 + #error archfound riscv32 + #endif + +// LOONGARCH +#elif defined(__loongarch_lp64) + #error archfound loongarch64 + +// Emscripten (WebAssembly) +#elif defined(__EMSCRIPTEN__) + #error archfound wasm32 + +// return 'unrecognized' if we do not know what architecture this is +#else + #error archfound unrecognized +#endif diff --git a/3rdparty/zlib-ng/cmake/detect-arch.cmake b/3rdparty/zlib-ng/cmake/detect-arch.cmake new file mode 100644 index 0000000000..dfdc6013ce --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-arch.cmake @@ -0,0 +1,104 @@ +# detect-arch.cmake -- Detect compiler architecture and set ARCH and BASEARCH +# Copyright (C) 2019 Hans Kristian Rosbach +# Licensed under the Zlib license, see LICENSE.md for details +set(ARCHDETECT_FOUND TRUE) + +if(CMAKE_OSX_ARCHITECTURES) + # If multiple architectures are requested (universal build), pick only the first + list(GET CMAKE_OSX_ARCHITECTURES 0 ARCH) +elseif(MSVC) + if("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "X86") + set(ARCH "i686") + elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "x64") + set(ARCH "x86_64") + elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARMV7") + set(ARCH "arm") + elseif ("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64EC") + set(ARCH "aarch64") + endif() +elseif(EMSCRIPTEN) + set(ARCH "wasm32") +elseif(CMAKE_CROSSCOMPILING) + set(ARCH ${CMAKE_C_COMPILER_TARGET}) +else() + # Let preprocessor parse archdetect.c and raise an error containing the arch identifier + enable_language(C) + try_run( + run_result_unused + compile_result_unused + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_LIST_DIR}/detect-arch.c + COMPILE_OUTPUT_VARIABLE RAWOUTPUT + CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} + ) + + # Find basearch tag, and extract the arch word into BASEARCH variable + string(REGEX REPLACE ".*archfound ([a-zA-Z0-9_]+).*" "\\1" ARCH "${RAWOUTPUT}") + if(NOT ARCH) + set(ARCH unknown) + endif() +endif() + +# Make sure we have ARCH set +if(NOT ARCH OR ARCH STREQUAL "unknown") + set(ARCH ${CMAKE_SYSTEM_PROCESSOR}) + message(STATUS "Arch not recognized, falling back to cmake arch: '${ARCH}'") +else() + message(STATUS "Arch detected: '${ARCH}'") +endif() + +# Base arch detection +if("${ARCH}" MATCHES "(x86_64|AMD64|i[3-6]86)") + set(BASEARCH "x86") + set(BASEARCH_X86_FOUND TRUE) +elseif("${ARCH}" MATCHES "(arm(v[0-9])?|aarch64|cortex)") + set(BASEARCH "arm") + set(BASEARCH_ARM_FOUND TRUE) +elseif("${ARCH}" MATCHES "ppc(64(le)?)?|powerpc(64(le)?)?") + set(BASEARCH "ppc") + set(BASEARCH_PPC_FOUND TRUE) +elseif("${ARCH}" MATCHES "alpha") + set(BASEARCH "alpha") + set(BASEARCH_ALPHA_FOUND TRUE) +elseif("${ARCH}" MATCHES "blackfin") + set(BASEARCH "blackfin") + set(BASEARCH_BLACKFIN_FOUND TRUE) +elseif("${ARCH}" MATCHES "ia64") + set(BASEARCH "ia64") + set(BASEARCH_IA64_FOUND TRUE) +elseif("${ARCH}" MATCHES "mips") + set(BASEARCH "mips") + set(BASEARCH_MIPS_FOUND TRUE) +elseif("${ARCH}" MATCHES "m68k") + set(BASEARCH "m68k") + set(BASEARCH_M68K_FOUND TRUE) +elseif("${ARCH}" MATCHES "sh") + set(BASEARCH "sh") + set(BASEARCH_SH_FOUND TRUE) +elseif("${ARCH}" MATCHES "sparc[89]?") + set(BASEARCH "sparc") + 
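The detect-arch.c program above is never executed: the try_run in detect-arch.cmake only compiles it, and the deliberate `#error archfound <id>` is scraped out of the compiler output with a regex. A runnable C analogue of the same preprocessor ladder, printing the identifier instead of failing compilation (abbreviated to a few architectures, purely illustrative):

```c
#include <stdio.h>

/* Same #if ladder as detect-arch.c, but resolving to a string that is
 * printed at run time instead of raised via a compile-time #error. */
#if defined(__x86_64__) || defined(_M_X64)
#  define ARCHFOUND "x86_64"
#elif defined(__i386) || defined(_M_IX86)
#  define ARCHFOUND "i686"
#elif defined(__aarch64__) || defined(_M_ARM64)
#  define ARCHFOUND "aarch64"
#elif defined(__riscv) && __riscv_xlen == 64
#  define ARCHFOUND "riscv64"
#else
#  define ARCHFOUND "unrecognized"
#endif

int main(void) {
    /* detect-arch.cmake extracts the token from compiler output with:
     *   string(REGEX REPLACE ".*archfound ([a-zA-Z0-9_]+).*" "\\1" ARCH "${RAWOUTPUT}") */
    printf("archfound %s\n", ARCHFOUND);
    return 0;
}
```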
set(BASEARCH_SPARC_FOUND TRUE) +elseif("${ARCH}" MATCHES "s3[679]0x?") + set(BASEARCH "s360") + set(BASEARCH_S360_FOUND TRUE) +elseif("${ARCH}" MATCHES "parisc") + set(BASEARCH "parisc") + set(BASEARCH_PARISC_FOUND TRUE) +elseif("${ARCH}" MATCHES "rs6000") + set(BASEARCH "rs6000") + set(BASEARCH_RS6000_FOUND TRUE) +elseif("${ARCH}" MATCHES "riscv(32|64)") + set(BASEARCH "riscv") + set(BASEARCH_RISCV_FOUND TRUE) +elseif("${ARCH}" MATCHES "loongarch64") + set(BASEARCH "loongarch") + set(BASEARCH_LOONGARCH_FOUND TRUE) +elseif("${ARCH}" MATCHES "wasm32") + set(BASEARCH "wasm32") + set(BASEARCH_WASM32_FOUND TRUE) +else() + set(BASEARCH "x86") + set(BASEARCH_X86_FOUND TRUE) + message(STATUS "Basearch '${ARCH}' not recognized, defaulting to 'x86'.") +endif() +message(STATUS "Basearch of '${ARCH}' has been detected as: '${BASEARCH}'") diff --git a/3rdparty/zlib-ng/cmake/detect-coverage.cmake b/3rdparty/zlib-ng/cmake/detect-coverage.cmake new file mode 100644 index 0000000000..8e67a085cd --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-coverage.cmake @@ -0,0 +1,46 @@ +# detect-coverage.cmake -- Detect supported compiler coverage flags +# Licensed under the Zlib license, see LICENSE.md for details + +macro(add_code_coverage) + # Check for -coverage flag support for Clang/GCC + if(CMAKE_VERSION VERSION_LESS 3.14) + set(CMAKE_REQUIRED_LIBRARIES -lgcov) + else() + set(CMAKE_REQUIRED_LINK_OPTIONS -coverage) + endif() + check_c_compiler_flag(-coverage HAVE_COVERAGE) + set(CMAKE_REQUIRED_LIBRARIES) + set(CMAKE_REQUIRED_LINK_OPTIONS) + + if(HAVE_COVERAGE) + add_compile_options(-coverage) + add_link_options(-coverage) + message(STATUS "Code coverage enabled using: -coverage") + else() + # Some versions of GCC don't support -coverage shorthand + if(CMAKE_VERSION VERSION_LESS 3.14) + set(CMAKE_REQUIRED_LIBRARIES -lgcov) + else() + set(CMAKE_REQUIRED_LINK_OPTIONS -lgcov -fprofile-arcs) + endif() + check_c_compiler_flag("-ftest-coverage -fprofile-arcs -fprofile-values" HAVE_TEST_COVERAGE) + set(CMAKE_REQUIRED_LIBRARIES) + set(CMAKE_REQUIRED_LINK_OPTIONS) + + if(HAVE_TEST_COVERAGE) + add_compile_options(-ftest-coverage -fprofile-arcs -fprofile-values) + add_link_options(-lgcov -fprofile-arcs) + message(STATUS "Code coverage enabled using: -ftest-coverage") + else() + message(WARNING "Compiler does not support code coverage") + set(WITH_CODE_COVERAGE OFF) + endif() + endif() + + # Set optimization level to zero for code coverage builds + if (WITH_CODE_COVERAGE) + # Use CMake compiler flag variables due to add_compile_options failure on Windows GCC + set(CMAKE_C_FLAGS "-O0 ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "-O0 ${CMAKE_CXX_FLAGS}") + endif() +endmacro() diff --git a/3rdparty/zlib-ng/cmake/detect-install-dirs.cmake b/3rdparty/zlib-ng/cmake/detect-install-dirs.cmake new file mode 100644 index 0000000000..a7c774f474 --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-install-dirs.cmake @@ -0,0 +1,43 @@ +# detect-install-dirs.cmake -- Detect install directory parameters +# Copyright (C) 2021 Hans Kristian Rosbach +# Licensed under the Zlib license, see LICENSE.md for details + +# Determine installation directory for executables +if (DEFINED BIN_INSTALL_DIR) + set(BIN_INSTALL_DIR "${BIN_INSTALL_DIR}" CACHE PATH "Installation directory for executables (Deprecated)" FORCE) + set(CMAKE_INSTALL_BINDIR "${BIN_INSTALL_DIR}") +elseif (DEFINED INSTALL_BIN_DIR) + set(CMAKE_INSTALL_BINDIR "${INSTALL_BIN_DIR}") +endif() + +# Determine installation directory for libraries +if (DEFINED LIB_INSTALL_DIR) + set(LIB_INSTALL_DIR 
"${LIB_INSTALL_DIR}" CACHE PATH "Installation directory for libraries (Deprecated)" FORCE) + set(CMAKE_INSTALL_LIBDIR "${LIB_INSTALL_DIR}") +elseif (DEFINED INSTALL_LIB_DIR) + set(CMAKE_INSTALL_LIBDIR "${INSTALL_LIB_DIR}") +endif() + +# Determine installation directory for include files +if (DEFINED INC_INSTALL_DIR) + set(INC_INSTALL_DIR "${INC_INSTALL_DIR}" CACHE PATH "Installation directory for headers (Deprecated)" FORCE) + set(CMAKE_INSTALL_INCLUDEDIR "${INC_INSTALL_DIR}") +elseif (DEFINED INSTALL_INC_DIR) + set(CMAKE_INSTALL_INCLUDEDIR "${INSTALL_INC_DIR}") +endif() + +# Define GNU standard installation directories +include(GNUInstallDirs) + +# Determine installation directory for pkgconfig files +if (DEFINED PKGCONFIG_INSTALL_DIR) + set(PKGCONFIG_INSTALL_DIR "${PKGCONFIG_INSTALL_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE) +elseif (DEFINED INSTALL_PKGCONFIG_DIR) + set(PKGCONFIG_INSTALL_DIR "${INSTALL_PKGCONFIG_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE) +elseif (DEFINED CMAKE_INSTALL_PKGCONFIGDIR) + set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE) +elseif (DEFINED CMAKE_INSTALL_FULL_PKGCONFIGDIR) + set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_FULL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE) +else() + set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/pkgconfig" CACHE PATH "Installation directory for pkgconfig (.pc) files") +endif() diff --git a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake index 74ac3910b8..78e46e14bb 100644 --- a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake +++ b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake @@ -2,40 +2,39 @@ # Licensed under the Zlib license, see LICENSE.md for details macro(check_acle_compiler_flag) - if(MSVC) - # Both ARM and ARM64-targeting msvc support intrinsics, but - # ARM msvc is missing some intrinsics introduced with ARMv8, e.g. 
crc32 - if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64") - set(HAVE_ACLE_FLAG TRUE) - endif() - else() + if(NOT NATIVEFLAG) if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + check_c_compiler_flag("-march=armv8-a+crc" HAVE_MARCH_ARMV8_CRC) + if(HAVE_MARCH_ARMV8_CRC) set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support") + else() + check_c_compiler_flag("-march=armv8-a+crc+simd" HAVE_MARCH_ARMV8_CRC_SIMD) + if(HAVE_MARCH_ARMV8_CRC_SIMD) + set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support") + endif() endif() endif() - # Check whether compiler supports ACLE flag - set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") - check_c_source_compiles( - "int main() { return 0; }" - HAVE_ACLE_FLAG FAIL_REGEX "not supported") - if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG) - set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE) - # Check whether compiler supports ACLE flag - set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}") - check_c_source_compiles( - "int main() { return 0; }" - HAVE_ACLE_FLAG2 FAIL_REGEX "not supported") - set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE) - unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable - endif() - set(CMAKE_REQUIRED_FLAGS) endif() + # Check whether compiler supports ARMv8 CRC intrinsics + set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") + check_c_source_compiles( + "#if defined(_MSC_VER) + #include <intrin.h> + #else + #include <arm_acle.h> + #endif + unsigned int f(unsigned int a, unsigned int b) { + return __crc32w(a, b); + } + int main(void) { return 0; }" + HAVE_ACLE_FLAG + ) + set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_armv6_compiler_flag) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6) if(HAVE_MARCH_ARMV6) set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support") @@ -67,21 +66,21 @@ macro(check_armv6_compiler_flag) return __uqsub16(a, b); #endif } - int main(void) { return 0; }" + int main(void) { return f(1,2); }" HAVE_ARMV6_INTRIN ) set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_avx512_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl") - else() - set(AVX512FLAG "/arch:AVX512") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl") + else() + set(AVX512FLAG "/arch:AVX512") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal # instruction scheduling unless you specify a reasonable -mtune= target set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl") @@ -94,9 +93,9 @@ macro(check_avx512_intrinsics) endif() unset(HAVE_CASCADE_LAKE) endif() + elseif(MSVC) + set(AVX512FLAG "/arch:AVX512") endif() - elseif(MSVC) - set(AVX512FLAG "/arch:AVX512") endif() # Check whether compiler supports AVX512 intrinsics set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG}
${ZNOLTOFLAG}") @@ -109,26 +108,17 @@ macro(check_avx512_intrinsics) int main(void) { return 0; }" HAVE_AVX512_INTRIN ) - - # Evidently both GCC and clang were late to implementing these - check_c_source_compiles( - "#include <immintrin.h> - __mmask16 f(__mmask16 x) { return _knot_mask16(x); } - int main(void) { return 0; }" - HAVE_MASK_INTRIN - ) - set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_avx512vnni_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") - else() - set(AVX512VNNIFLAG "/arch:AVX512") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM") + set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni") + else() + set(AVX512VNNIFLAG "/arch:AVX512") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni") if(NOT MSVC) check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE) @@ -139,11 +129,10 @@ macro(check_avx512vnni_intrinsics) endif() unset(HAVE_CASCADE_LAKE) endif() + elseif(MSVC) + set(AVX512VNNIFLAG "/arch:AVX512") endif() - elseif(MSVC) - set(AVX512VNNIFLAG "/arch:AVX512") endif() - # Check whether compiler supports AVX512vnni intrinsics set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") check_c_source_compiles( @@ -159,18 +148,18 @@ endmacro() macro(check_avx2_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(AVX2FLAG "-mavx2") + else() + set(AVX2FLAG "/arch:AVX2") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(AVX2FLAG "-mavx2") - else() + elseif(MSVC) set(AVX2FLAG "/arch:AVX2") endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) - set(AVX2FLAG "-mavx2") - endif() - elseif(MSVC) - set(AVX2FLAG "/arch:AVX2") endif() # Check whether compiler supports AVX2 intrinsics set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") @@ -187,8 +176,8 @@ endmacro() macro(check_neon_compiler_flag) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") if("${ARCH}" MATCHES "aarch64") set(NEONFLAG "-march=armv8-a+simd") else() @@ -206,12 +195,52 @@ macro(check_neon_compiler_flag) #endif int main() { return 0; }" NEON_AVAILABLE FAIL_REGEX "not supported") + # Check whether compiler native flag is enough for NEON support + # Some GCC versions don't enable FPU (vector unit) when using -march=native + if(NEON_AVAILABLE AND NATIVEFLAG AND (NOT "${ARCH}" MATCHES "aarch64")) + check_c_source_compiles( + "#include <arm_neon.h> + uint8x16_t f(uint8x16_t x, uint8x16_t y) { + return vaddq_u8(x, y); + } + int main(int argc, char* argv[]) { + uint8x16_t a = vdupq_n_u8(argc); + uint8x16_t b = vdupq_n_u8(argc); + uint8x16_t result = f(a, b); + return result[0]; + }" + ARM_NEON_SUPPORT_NATIVE + ) + if(NOT ARM_NEON_SUPPORT_NATIVE) + set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG} -mfpu=neon ${ZNOLTOFLAG}") +
check_c_source_compiles( + "#include <arm_neon.h> + uint8x16_t f(uint8x16_t x, uint8x16_t y) { + return vaddq_u8(x, y); + } + int main(int argc, char* argv[]) { + uint8x16_t a = vdupq_n_u8(argc); + uint8x16_t b = vdupq_n_u8(argc); + uint8x16_t result = f(a, b); + return result[0]; + }" + ARM_NEON_SUPPORT_NATIVE_MFPU + ) + if(ARM_NEON_SUPPORT_NATIVE_MFPU) + set(NEONFLAG "-mfpu=neon") + else() + # Remove local NEON_AVAILABLE variable and overwrite the cache + unset(NEON_AVAILABLE) + set(NEON_AVAILABLE "" CACHE INTERNAL "NEON support available" FORCE) + endif() + endif() + endif() set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_neon_ld4_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") if("${ARCH}" MATCHES "aarch64") set(NEONFLAG "-march=armv8-a+simd") else() @@ -234,8 +263,8 @@ endmacro() macro(check_pclmulqdq_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM") set(PCLMULFLAG "-mpclmul") endif() endif() @@ -257,8 +286,8 @@ endmacro() macro(check_vpclmulqdq_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM") set(VPCLMULFLAG "-mvpclmulqdq -mavx512f") endif() endif() @@ -341,8 +370,8 @@ macro(check_ppc_intrinsics) endmacro() macro(check_power8_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(POWER8FLAG "-mcpu=power8") endif() endif() @@ -364,12 +393,27 @@ macro(check_power8_intrinsics) }" HAVE_POWER8_INTRIN ) + if(NOT HAVE_POWER8_INTRIN AND HAVE_LINUX_AUXVEC_H) + check_c_source_compiles( + "#include <sys/auxv.h> + #include <linux/auxvec.h> + int main() { + return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07); + }" + HAVE_POWER8_INTRIN2 + ) + if(HAVE_POWER8_INTRIN2) + set(POWER8_NEED_AUXVEC_H 1) + set(HAVE_POWER8_INTRIN ${HAVE_POWER8_INTRIN2} CACHE INTERNAL "Have POWER8 intrinsics" FORCE) + unset(HAVE_POWER8_INTRIN2 CACHE) + endif() + endif() set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_rvv_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(RISCVFLAG "-march=rv64gcv") endif() endif() @@ -399,8 +443,8 @@ macro(check_s390_intrinsics) endmacro() macro(check_power9_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(POWER9FLAG "-mcpu=power9") endif() endif() @@ -422,22 +466,37 @@ macro(check_power9_intrinsics) }" HAVE_POWER9_INTRIN ) + if(NOT HAVE_POWER9_INTRIN AND HAVE_LINUX_AUXVEC_H) + check_c_source_compiles( + "#include <sys/auxv.h> + #include <linux/auxvec.h> + int main() { + return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_00); + }" + HAVE_POWER9_INTRIN2 + ) + if(HAVE_POWER9_INTRIN2) + set(POWER9_NEED_AUXVEC_H 1) + set(HAVE_POWER9_INTRIN
${HAVE_POWER9_INTRIN2} CACHE INTERNAL "Have POWER9 intrinsics" FORCE) + unset(HAVE_POWER9_INTRIN2 CACHE) + endif() + endif() set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_sse2_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(SSE2FLAG "-msse2") - else() - set(SSE2FLAG "/arch:SSE2") - endif() - elseif(MSVC) - if(NOT "${ARCH}" MATCHES "x86_64") - set(SSE2FLAG "/arch:SSE2") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(SSE2FLAG "-msse2") + else() + set(SSE2FLAG "/arch:SSE2") + endif() + elseif(MSVC) + if(NOT "${ARCH}" MATCHES "x86_64") + set(SSE2FLAG "/arch:SSE2") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(SSE2FLAG "-msse2") endif() endif() @@ -453,14 +512,14 @@ endmacro() macro(check_ssse3_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(SSSE3FLAG "-mssse3") - else() - set(SSSE3FLAG "/arch:SSSE3") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(SSSE3FLAG "-mssse3") + else() + set(SSSE3FLAG "/arch:SSSE3") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(SSSE3FLAG "-mssse3") endif() endif() @@ -478,14 +537,14 @@ endmacro() macro(check_sse42_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(SSE42FLAG "-msse4.2") - else() - set(SSE42FLAG "/arch:SSE4.2") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(SSE42FLAG "-msse4.2") + else() + set(SSE42FLAG "/arch:SSE4.2") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(SSE42FLAG "-msse4.2") endif() endif() @@ -526,15 +585,17 @@ macro(check_vgfma_intrinsics) endmacro() macro(check_xsave_intrinsics) - if(NOT NATIVEFLAG AND NOT MSVC) + if(NOT NATIVEFLAG AND NOT MSVC AND NOT CMAKE_C_COMPILER_ID MATCHES "Intel") set(XSAVEFLAG "-mxsave") endif() set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") check_c_source_compiles( "#ifdef _MSC_VER # include <intrin.h> + #elif __GNUC__ == 8 && __GNUC_MINOR__ > 1 + # include <xsaveintrin.h> #else - # include <x86gprintrin.h> + # include <immintrin.h> #endif unsigned int f(unsigned int a) { return (int) _xgetbv(a); } int main(void) { return 0; }" diff --git a/3rdparty/zlib-ng/cmake/detect-sanitizer.cmake b/3rdparty/zlib-ng/cmake/detect-sanitizer.cmake new file mode 100644 index 0000000000..f9521ec2f5 --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-sanitizer.cmake @@ -0,0 +1,166 @@ +# detect-sanitizer.cmake -- Detect supported compiler sanitizer flags +# Licensed under the Zlib license, see LICENSE.md for details + +macro(add_common_sanitizer_flags) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + add_compile_options(-g3) + endif() + check_c_compiler_flag(-fno-omit-frame-pointer HAVE_NO_OMIT_FRAME_POINTER) + if(HAVE_NO_OMIT_FRAME_POINTER) + add_compile_options(-fno-omit-frame-pointer) + add_link_options(-fno-omit-frame-pointer) + endif() + check_c_compiler_flag(-fno-optimize-sibling-calls
HAVE_NO_OPTIMIZE_SIBLING_CALLS) + if(HAVE_NO_OPTIMIZE_SIBLING_CALLS) + add_compile_options(-fno-optimize-sibling-calls) + add_link_options(-fno-optimize-sibling-calls) + endif() +endmacro() + +macro(check_sanitizer_support known_checks supported_checks) + set(available_checks "") + + # Build list of supported sanitizer flags by incrementally trying compilation with + # known sanitizer checks + + foreach(check ${known_checks}) + if(available_checks STREQUAL "") + set(compile_checks "${check}") + else() + set(compile_checks "${available_checks},${check}") + endif() + + set(CMAKE_REQUIRED_FLAGS -fsanitize=${compile_checks}) + + check_c_source_compiles("int main() { return 0; }" HAVE_SANITIZER_${check} + FAIL_REGEX "not supported|unrecognized command|unknown option") + + set(CMAKE_REQUIRED_FLAGS) + + if(HAVE_SANITIZER_${check}) + set(available_checks ${compile_checks}) + endif() + endforeach() + + set(${supported_checks} ${available_checks}) +endmacro() + +macro(add_address_sanitizer) + set(known_checks + address + pointer-compare + pointer-subtract + ) + + check_sanitizer_support("${known_checks}" supported_checks) + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Address sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + add_common_sanitizer_flags() + else() + message(STATUS "Address sanitizer is not supported") + endif() + + if(CMAKE_CROSSCOMPILING_EMULATOR) + # Only check for leak sanitizer if not cross-compiling due to qemu crash + message(WARNING "Leak sanitizer is not supported when cross compiling") + else() + # Leak sanitizer requires address sanitizer + check_sanitizer_support("leak" supported_checks) + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Leak sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + add_common_sanitizer_flags() + else() + message(STATUS "Leak sanitizer is not supported") + endif() + endif() +endmacro() + +macro(add_memory_sanitizer) + check_sanitizer_support("memory" supported_checks) + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Memory sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + add_common_sanitizer_flags() + + check_c_compiler_flag(-fsanitize-memory-track-origins HAVE_MEMORY_TRACK_ORIGINS) + if(HAVE_MEMORY_TRACK_ORIGINS) + add_compile_options(-fsanitize-memory-track-origins) + add_link_options(-fsanitize-memory-track-origins) + endif() + else() + message(STATUS "Memory sanitizer is not supported") + endif() +endmacro() + +macro(add_thread_sanitizer) + check_sanitizer_support("thread" supported_checks) + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Thread sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + add_common_sanitizer_flags() + else() + message(STATUS "Thread sanitizer is not supported") + endif() +endmacro() + +macro(add_undefined_sanitizer) + set(known_checks + array-bounds + bool + bounds + builtin + enum + float-cast-overflow + float-divide-by-zero + function + integer-divide-by-zero + local-bounds + null + nonnull-attribute + pointer-overflow + return + returns-nonnull-attribute + shift + shift-base + shift-exponent + signed-integer-overflow + undefined + unsigned-integer-overflow + unsigned-shift-base 
+ vla-bound + vptr + ) + + # Only check for alignment sanitizer flag if unaligned access is not supported + if(NOT WITH_UNALIGNED) + list(APPEND known_checks alignment) + endif() + # Object size sanitizer has no effect at -O0 and produces compiler warning if enabled + if(NOT CMAKE_C_FLAGS MATCHES "-O0") + list(APPEND known_checks object-size) + endif() + + check_sanitizer_support("${known_checks}" supported_checks) + + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Undefined behavior sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + + # Group sanitizer flag -fsanitize=undefined will automatically add alignment, even if + # it is not in our sanitize flag list, so we need to explicitly disable alignment sanitizing. + if(WITH_UNALIGNED) + add_compile_options(-fno-sanitize=alignment) + endif() + + add_common_sanitizer_flags() + else() + message(STATUS "Undefined behavior sanitizer is not supported") + endif() +endmacro() diff --git a/3rdparty/zlib-ng/cpu_features.h b/3rdparty/zlib-ng/cpu_features.h index 00fa6c747c..8708724bc0 100644 --- a/3rdparty/zlib-ng/cpu_features.h +++ b/3rdparty/zlib-ng/cpu_features.h @@ -6,12 +6,10 @@ #ifndef CPU_FEATURES_H_ #define CPU_FEATURES_H_ -#include "adler32_fold.h" -#include "crc32_fold.h" +#ifndef DISABLE_RUNTIME_CPU_DETECTION #if defined(X86_FEATURES) # include "arch/x86/x86_features.h" -# include "fallback_builtins.h" #elif defined(ARM_FEATURES) # include "arch/arm/arm_features.h" #elif defined(PPC_FEATURES) || defined(POWER_FEATURES) @@ -38,266 +36,8 @@ struct cpu_features { #endif }; -extern void cpu_check_features(struct cpu_features *features); +void cpu_check_features(struct cpu_features *features); -/* adler32 */ -typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len); - -extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len); -#ifdef ARM_NEON -extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef PPC_VMX -extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef RISCV_RVV -extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef X86_SSSE3 -extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef X86_AVX2 -extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef X86_AVX512 -extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef X86_AVX512VNNI -extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef POWER8_VSX -extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len); -#endif - -/* adler32 folding */ -#ifdef RISCV_RVV -extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif -#ifdef X86_SSE42 -extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif -#ifdef X86_AVX2 -extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif -#ifdef X86_AVX512 -extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif -#ifdef X86_AVX512VNNI -extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif - -/* CRC32 folding */ -#ifdef X86_PCLMULQDQ_CRC -extern 
uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc); -extern void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); -extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); -extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc); -extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); -#endif -#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC) -extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc); -extern void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); -extern void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); -extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc); -extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); -#endif - -/* memory chunking */ -extern uint32_t chunksize_c(void); -extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#ifdef X86_SSE2 -extern uint32_t chunksize_sse2(void); -extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef X86_SSSE3 -extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef X86_AVX2 -extern uint32_t chunksize_avx2(void); -extern uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef ARM_NEON -extern uint32_t chunksize_neon(void); -extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef POWER8_VSX -extern uint32_t chunksize_power8(void); -extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef RISCV_RVV -extern uint32_t chunksize_rvv(void); -extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif - -#ifdef ZLIB_COMPAT -typedef struct z_stream_s z_stream; -#else -typedef struct zng_stream_s zng_stream; -#endif - -/* inflate fast loop */ -extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start); -#ifdef X86_SSE2 -extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef X86_SSSE3 -extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef X86_AVX2 -extern void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef ARM_NEON -extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef POWER8_VSX -extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef RISCV_RVV -extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start); -#endif - -/* CRC32 */ -typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len); - -extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len); -#ifdef ARM_ACLE -extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len); -#elif defined(POWER8_VSX) -extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len); -#elif defined(S390_CRC32_VX) -extern uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len); -#endif - -/* compare256 */ -typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1); - -extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1); -#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN -extern uint32_t 
compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1); -#ifdef HAVE_BUILTIN_CTZ -extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1); -#endif -#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1); -#endif -#endif -#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); -#endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); -#endif -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); -#endif -#ifdef POWER9 -extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1); -#endif -#ifdef RISCV_RVV -extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1); -#endif - -#ifdef DEFLATE_H_ -/* insert_string */ -extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count); -#ifdef X86_SSE42 -extern void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count); -#elif defined(ARM_ACLE) -extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count); -#endif - -/* longest_match */ -extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match); -#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN -extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match); -#ifdef HAVE_BUILTIN_CTZ -extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match); -#endif -#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match); -#endif -#endif -#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match); -#endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match); -#endif -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match); -#endif -#ifdef POWER9 -extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match); -#endif -#ifdef RISCV_RVV -extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match); -#endif - -/* longest_match_slow */ -extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match); -#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN -extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match); -extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match); -#ifdef UNALIGNED64_OK -extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match); -#endif -#endif -#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match); -#endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match); -#endif -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match); -#endif -#ifdef POWER9 -extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match); -#endif -#ifdef RISCV_RVV -extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match); 
-#endif - -/* quick_insert_string */ -extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str); -#ifdef X86_SSE42 -extern Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str); -#elif defined(ARM_ACLE) -extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str); -#endif - -/* slide_hash */ -typedef void (*slide_hash_func)(deflate_state *s); - -#ifdef X86_SSE2 -extern void slide_hash_sse2(deflate_state *s); -#endif -#if defined(ARM_SIMD) -extern void slide_hash_armv6(deflate_state *s); -#endif -#if defined(ARM_NEON) -extern void slide_hash_neon(deflate_state *s); -#endif -#if defined(PPC_VMX) -extern void slide_hash_vmx(deflate_state *s); -#endif -#if defined(POWER8_VSX) -extern void slide_hash_power8(deflate_state *s); -#endif -#if defined(RISCV_RVV) -extern void slide_hash_rvv(deflate_state *s); -#endif -#ifdef X86_AVX2 -extern void slide_hash_avx2(deflate_state *s); -#endif - -/* update_hash */ -extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val); -#ifdef X86_SSE42 -extern uint32_t update_hash_sse42(deflate_state *const s, uint32_t h, uint32_t val); -#elif defined(ARM_ACLE) -extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val); -#endif #endif #endif diff --git a/3rdparty/zlib-ng/crc32.c b/3rdparty/zlib-ng/crc32.c new file mode 100644 index 0000000000..54f6ecd420 --- /dev/null +++ b/3rdparty/zlib-ng/crc32.c @@ -0,0 +1,42 @@ +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2022 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * This interleaved implementation of a CRC makes use of pipelined multiple + * arithmetic-logic units, commonly found in modern CPU cores. It is due to + * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. 
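The per-variant extern declarations removed above now live in the per-arch headers included from arch_functions.h, and call sites (such as the new crc32.c whose header comment appears just above) go through FUNCTABLE_CALL instead of reading function pointers from functable directly. A minimal sketch of that kind of lazily populated dispatch table follows; the names and the single-variant setup are hypothetical, not zlib-ng's actual functable, which consults cpu_check_features() and may install SIMD variants:

```c
#include <stdint.h>
#include <stddef.h>

typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len);

struct functable_s { crc32_func crc32; };

static uint32_t crc32_generic(uint32_t crc, const uint8_t *buf, size_t len);
static uint32_t crc32_stub(uint32_t crc, const uint8_t *buf, size_t len);

/* Every slot starts as a stub; the first call swaps in the chosen variant. */
static struct functable_s ft = { crc32_stub };

#define FUNCTABLE_CALL(name) ft.name

/* Bitwise CRC-32 update (pre/post conditioning left to the caller). */
static uint32_t crc32_generic(uint32_t crc, const uint8_t *buf, size_t len) {
    while (len--) {
        crc ^= *buf++;
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0xedb88320u & (0u - (crc & 1u)));
    }
    return crc;
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t *buf, size_t len) {
    /* A real implementation would pick the best variant for this CPU here;
     * this sketch only has the generic one. */
    ft.crc32 = crc32_generic;
    return ft.crc32(crc, buf, len);
}
```

A caller then writes FUNCTABLE_CALL(crc32)(crc, buf, len), matching the style used throughout the updated sources.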
+ */ + +#include "zbuild.h" +#include "functable.h" +#include "crc32_braid_tbl.h" + +/* ========================================================================= */ + +const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) { + return (const uint32_t *)crc_table; +} + +#ifdef ZLIB_COMPAT +unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) { + if (buf == NULL) return 0; + + return (unsigned long)FUNCTABLE_CALL(crc32)((uint32_t)crc, buf, len); +} +#else +uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) { + if (buf == NULL) return 0; + + return FUNCTABLE_CALL(crc32)(crc, buf, len); +} +#endif + +#ifdef ZLIB_COMPAT +unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) { + return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len); +} +#else +uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) { + return PREFIX(crc32_z)(crc, buf, len); +} +#endif diff --git a/3rdparty/zlib-ng/crc32.h b/3rdparty/zlib-ng/crc32.h new file mode 100644 index 0000000000..8c3d7a8a3e --- /dev/null +++ b/3rdparty/zlib-ng/crc32.h @@ -0,0 +1,16 @@ +/* crc32.h -- crc32 folding interface + * Copyright (C) 2021 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#ifndef CRC32_H_ +#define CRC32_H_ + +#define CRC32_FOLD_BUFFER_SIZE (16 * 4) +/* sizeof(__m128i) * (4 folds) */ + +typedef struct crc32_fold_s { + uint8_t fold[CRC32_FOLD_BUFFER_SIZE]; + uint32_t value; +} crc32_fold; + +#endif diff --git a/3rdparty/zlib-ng/crc32_braid_comb.c b/3rdparty/zlib-ng/crc32_braid_comb.c index 75fb474258..f253ae10a2 100644 --- a/3rdparty/zlib-ng/crc32_braid_comb.c +++ b/3rdparty/zlib-ng/crc32_braid_comb.c @@ -7,7 +7,6 @@ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. */ -#include "zbuild.h" #include "zutil.h" #include "crc32_braid_p.h" #include "crc32_braid_tbl.h" diff --git a/3rdparty/zlib-ng/crc32_braid_p.h b/3rdparty/zlib-ng/crc32_braid_p.h index 1d8a07068a..003bf91920 100644 --- a/3rdparty/zlib-ng/crc32_braid_p.h +++ b/3rdparty/zlib-ng/crc32_braid_p.h @@ -1,7 +1,6 @@ #ifndef CRC32_BRAID_P_H_ #define CRC32_BRAID_P_H_ -#include "zbuild.h" #include "zendian.h" /* Define N */ @@ -25,7 +24,7 @@ # endif #else # ifndef W -# if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) +# if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__) # define W 8 # else # define W 4 @@ -42,9 +41,24 @@ # endif #endif +#if BYTE_ORDER == LITTLE_ENDIAN +# define ZSWAPWORD(word) (word) +# define BRAID_TABLE crc_braid_table +#elif BYTE_ORDER == BIG_ENDIAN +# if W == 8 +# define ZSWAPWORD(word) ZSWAP64(word) +# elif W == 4 +# define ZSWAPWORD(word) ZSWAP32(word) +# endif +# define BRAID_TABLE crc_braid_big_table +#else +# error "No endian defined" +#endif + +#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + /* CRC polynomial. 
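The DO1/DO8 macros added to crc32_braid_p.h above are the classic table-driven byte-at-a-time CRC step, unrolled eight-fold. A self-contained equivalent with a locally generated table (in the library the table is crc_table from crc32_braid_tbl.h); make_crc_table() must run once before the first crc32_bytewise() call:

```c
#include <stdint.h>
#include <stddef.h>

static uint32_t crc_table[256];

/* Build the lookup table for the reflected polynomial 0xedb88320. */
static void make_crc_table(void) {
    for (uint32_t n = 0; n < 256; n++) {
        uint32_t c = n;
        for (int k = 0; k < 8; k++)
            c = (c & 1) ? 0xedb88320u ^ (c >> 1) : c >> 1;
        crc_table[n] = c;
    }
}

#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1

uint32_t crc32_bytewise(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t c = crc ^ 0xffffffffu;          /* pre-conditioning */
    while (len >= 8) { DO8; len -= 8; }
    while (len--) { DO1; }
    return c ^ 0xffffffffu;                  /* post-conditioning */
}
```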
*/ #define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */ -extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len); - #endif /* CRC32_BRAID_P_H_ */ diff --git a/3rdparty/zlib-ng/crc32_fold.h b/3rdparty/zlib-ng/crc32_fold.h deleted file mode 100644 index 0d2ff66967..0000000000 --- a/3rdparty/zlib-ng/crc32_fold.h +++ /dev/null @@ -1,21 +0,0 @@ -/* crc32_fold.h -- crc32 folding interface - * Copyright (C) 2021 Nathan Moinvaziri - * For conditions of distribution and use, see copyright notice in zlib.h - */ -#ifndef CRC32_FOLD_H_ -#define CRC32_FOLD_H_ - -#define CRC32_FOLD_BUFFER_SIZE (16 * 4) -/* sizeof(__m128i) * (4 folds) */ - -typedef struct crc32_fold_s { - uint8_t fold[CRC32_FOLD_BUFFER_SIZE]; - uint32_t value; -} crc32_fold; - -Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc); -Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); -Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); -Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc); - -#endif diff --git a/3rdparty/zlib-ng/deflate.c b/3rdparty/zlib-ng/deflate.c index 2a0a20e5d2..66b5506a52 100644 --- a/3rdparty/zlib-ng/deflate.c +++ b/3rdparty/zlib-ng/deflate.c @@ -1,5 +1,5 @@ /* deflate.c -- compress data using the deflation algorithm - * Copyright (C) 1995-2023 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -58,7 +58,7 @@ # undef deflateInit2 #endif -const char PREFIX(deflate_copyright)[] = " deflate 1.3.0 Copyright 1995-2023 Jean-loup Gailly and Mark Adler "; +const char PREFIX(deflate_copyright)[] = " deflate 1.3.1 Copyright 1995-2024 Jean-loup Gailly and Mark Adler "; /* If you use the zlib library in a product, an acknowledgment is welcome in the documentation of your product. If for some reason you cannot @@ -71,14 +71,16 @@ const char PREFIX(deflate_copyright)[] = " deflate 1.3.0 Copyright 1995-2023 Jea */ #ifdef S390_DFLTCC_DEFLATE # include "arch/s390/dfltcc_deflate.h" +/* DFLTCC instructions require window to be page-aligned */ +# define PAD_WINDOW PAD_4096 +# define WINDOW_PAD_SIZE 4096 +# define HINT_ALIGNED_WINDOW HINT_ALIGNED_4096 #else -/* Memory management for the deflate state. Useful for allocating arch-specific extension blocks. */ -# define ZALLOC_DEFLATE_STATE(strm) ((deflate_state *)ZALLOC(strm, 1, sizeof(deflate_state))) -# define ZFREE_STATE(strm, addr) ZFREE(strm, addr) -# define ZCOPY_DEFLATE_STATE(dst, src) memcpy(dst, src, sizeof(deflate_state)) -/* Memory management for the window. Useful for allocation the aligned window. */ -# define ZALLOC_WINDOW(strm, items, size) ZALLOC(strm, items, size) -# define TRY_FREE_WINDOW(strm, addr) TRY_FREE(strm, addr) +# define PAD_WINDOW PAD_64 +# define WINDOW_PAD_SIZE 64 +# define HINT_ALIGNED_WINDOW HINT_ALIGNED_64 +/* Adjust the window size for the arch-specific deflate code. */ +# define DEFLATE_ADJUST_WINDOW_SIZE(n) (n) /* Invoked at the beginning of deflateSetDictionary(). Useful for checking arch-specific window data. */ # define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0) /* Invoked at the beginning of deflateGetDictionary(). Useful for adjusting arch-specific window data. 
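PAD_WINDOW, WINDOW_PAD_SIZE and HINT_ALIGNED_WINDOW above select page (4096-byte) alignment for the DFLTCC build, since its instructions require a page-aligned window, and cache-line (64-byte) alignment otherwise. The macro definitions themselves sit outside this hunk; they presumably reduce to the usual power-of-two round-up arithmetic, roughly:

```c
#include <stdint.h>

/* Assumed shape of the padding helpers (power-of-two alignments only). */
#define PAD_TO(x, align) (((uintptr_t)(x) + ((align) - 1)) & ~(uintptr_t)((align) - 1))
#define PADSZ(x, align)  (PAD_TO(x, align) - (uintptr_t)(x))   /* bytes of padding inserted */

#define PAD_16(x)   PAD_TO(x, 16)
#define PAD_64(x)   PAD_TO(x, 64)
#define PAD_4096(x) PAD_TO(x, 4096)

/* e.g. PAD_64(100) == 128 and PADSZ(100, 64) == 28 */
```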
*/ @@ -120,10 +122,6 @@ static void lm_set_level (deflate_state *s, int level); static void lm_init (deflate_state *s); Z_INTERNAL unsigned read_buf (PREFIX3(stream) *strm, unsigned char *buf, unsigned size); -extern uint32_t update_hash_roll (deflate_state *const s, uint32_t h, uint32_t val); -extern void insert_string_roll (deflate_state *const s, uint32_t str, uint32_t count); -extern Pos quick_insert_string_roll(deflate_state *const s, uint32_t str); - /* =========================================================================== * Local data */ @@ -185,17 +183,111 @@ static const config configuration_table[10] = { memset((unsigned char *)s->head, 0, HASH_SIZE * sizeof(*s->head)); \ } while (0) -/* ========================================================================= */ -/* This function is hidden in ZLIB_COMPAT builds. */ + +#ifdef DEF_ALLOC_DEBUG +# include <stdio.h> +# define LOGSZ(name,size) fprintf(stderr, "%s is %d bytes\n", name, size) +# define LOGSZP(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %d, padded %d\n", name, size, loc, pad) +# define LOGSZPL(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %ld, padded %d\n", name, size, loc, pad) +#else +# define LOGSZ(name,size) +# define LOGSZP(name,size,loc,pad) +# define LOGSZPL(name,size,loc,pad) +#endif + +/* =========================================================================== + * Allocate a big buffer and divide it up into the various buffers deflate needs. + * Handles alignment of allocated buffer and alignment of individual buffers. + */ +Z_INTERNAL deflate_allocs* alloc_deflate(PREFIX3(stream) *strm, int windowBits, int lit_bufsize) { + int curr_size = 0; + + /* Define sizes */ + int window_size = DEFLATE_ADJUST_WINDOW_SIZE((1 << windowBits) * 2); + int prev_size = (1 << windowBits) * sizeof(Pos); + int head_size = HASH_SIZE * sizeof(Pos); + int pending_size = lit_bufsize * LIT_BUFS; + int state_size = sizeof(deflate_state); + int alloc_size = sizeof(deflate_allocs); + + /* Calculate relative buffer positions and paddings */ + LOGSZP("window", window_size, PAD_WINDOW(curr_size), PADSZ(curr_size,WINDOW_PAD_SIZE)); + int window_pos = PAD_WINDOW(curr_size); + curr_size = window_pos + window_size; + + LOGSZP("prev", prev_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int prev_pos = PAD_64(curr_size); + curr_size = prev_pos + prev_size; + + LOGSZP("head", head_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int head_pos = PAD_64(curr_size); + curr_size = head_pos + head_size; + + LOGSZP("pending", pending_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int pending_pos = PAD_64(curr_size); + curr_size = pending_pos + pending_size; + + LOGSZP("state", state_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int state_pos = PAD_64(curr_size); + curr_size = state_pos + state_size; + + LOGSZP("alloc", alloc_size, PAD_16(curr_size), PADSZ(curr_size,16)); + int alloc_pos = PAD_16(curr_size); + curr_size = alloc_pos + alloc_size; + + /* Add 64-1 or 4096-1 to allow window alignment, and round size of buffer up to multiple of 64 */ + int total_size = PAD_64(curr_size + (WINDOW_PAD_SIZE - 1)); + + /* Allocate buffer, align to 64-byte cacheline, and zerofill the resulting buffer */ + char *original_buf = strm->zalloc(strm->opaque, 1, total_size); + if (original_buf == NULL) + return NULL; + + char *buff = (char *)HINT_ALIGNED_WINDOW((char *)PAD_WINDOW(original_buf)); + LOGSZPL("Buffer alloc", total_size, PADSZ((uintptr_t)original_buf,WINDOW_PAD_SIZE), PADSZ(curr_size,WINDOW_PAD_SIZE)); + + /* Initialize alloc_bufs
*/ + deflate_allocs *alloc_bufs = (struct deflate_allocs_s *)(buff + alloc_pos); + alloc_bufs->buf_start = (char *)original_buf; + alloc_bufs->zfree = strm->zfree; + + /* Assign buffers */ + alloc_bufs->window = (unsigned char *)HINT_ALIGNED_WINDOW(buff + window_pos); + alloc_bufs->prev = (Pos *)HINT_ALIGNED_64(buff + prev_pos); + alloc_bufs->head = (Pos *)HINT_ALIGNED_64(buff + head_pos); + alloc_bufs->pending_buf = (unsigned char *)HINT_ALIGNED_64(buff + pending_pos); + alloc_bufs->state = (deflate_state *)HINT_ALIGNED_16(buff + state_pos); + + memset((char *)alloc_bufs->prev, 0, prev_size); + + return alloc_bufs; +} + +/* =========================================================================== + * Free all allocated deflate buffers + */ +static inline void free_deflate(PREFIX3(stream) *strm) { + deflate_state *state = (deflate_state *)strm->state; + + if (state->alloc_bufs != NULL) { + deflate_allocs *alloc_bufs = state->alloc_bufs; + alloc_bufs->zfree(strm->opaque, alloc_bufs->buf_start); + strm->state = NULL; + } +} + +/* =========================================================================== + * Initialize deflate state and buffers. + * This function is hidden in ZLIB_COMPAT builds. + */ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level, int32_t method, int32_t windowBits, int32_t memLevel, int32_t strategy) { /* Todo: ignore strm->next_in if we use it as window */ - uint32_t window_padding = 0; deflate_state *s; int wrap = 1; - /* Force initialization functable, because deflate captures function pointers from functable. */ - functable.force_init(); + /* Initialize functable */ + FUNCTABLE_INIT; if (strm == NULL) return Z_STREAM_ERROR; @@ -230,9 +322,19 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level if (windowBits == 8) windowBits = 9; /* until 256-byte window bug fixed */ - s = ZALLOC_DEFLATE_STATE(strm); - if (s == NULL) + /* Allocate buffers */ + int lit_bufsize = 1 << (memLevel + 6); + deflate_allocs *alloc_bufs = alloc_deflate(strm, windowBits, lit_bufsize); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + + s = alloc_bufs->state; + s->alloc_bufs = alloc_bufs; + s->window = alloc_bufs->window; + s->prev = alloc_bufs->prev; + s->head = alloc_bufs->head; + s->pending_buf = alloc_bufs->pending_buf; + strm->state = (struct internal_state *)s; s->strm = strm; s->status = INIT_STATE; /* to pass state test in deflateReset() */ @@ -243,18 +345,9 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level s->w_size = 1 << s->w_bits; s->w_mask = s->w_size - 1; -#ifdef X86_PCLMULQDQ_CRC - window_padding = 8; -#endif - - s->window = (unsigned char *) ZALLOC_WINDOW(strm, s->w_size + window_padding, 2*sizeof(unsigned char)); - s->prev = (Pos *) ZALLOC(strm, s->w_size, sizeof(Pos)); - memset(s->prev, 0, s->w_size * sizeof(Pos)); - s->head = (Pos *) ZALLOC(strm, HASH_SIZE, sizeof(Pos)); - s->high_water = 0; /* nothing written to s->window yet */ - s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ + s->lit_bufsize = lit_bufsize; /* 16K elements by default */ /* We overlay pending_buf and sym_buf. This works since the average size * for length/distance pairs over any compressed block is assured to be 31 @@ -295,7 +388,6 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level * symbols from which it is being constructed. 
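alloc_deflate above collapses what used to be several separate zalloc calls into one arena: offsets are laid out first with the padding macros, a single block is allocated, the base is aligned once, and each sub-buffer is carved out at its precomputed offset; free_deflate then releases everything through one zfree. A compressed sketch of the carve-out pattern, using plain malloc, two buffers and hypothetical names:

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define PAD_TO(x, a) (((uintptr_t)(x) + ((a) - 1)) & ~(uintptr_t)((a) - 1))

typedef struct {
    void *buf_start;        /* what we eventually free() */
    unsigned char *window;  /* 64-byte aligned */
    uint16_t *prev;         /* 64-byte aligned */
} arena_t;

static int arena_alloc(arena_t *a, size_t window_size, size_t prev_count) {
    /* Lay out offsets relative to an imaginary base of 0. */
    size_t window_pos = PAD_TO(0, 64);
    size_t prev_pos   = PAD_TO(window_pos + window_size, 64);
    /* Extra 63 bytes of slack so the base itself can be aligned. */
    size_t total      = prev_pos + prev_count * sizeof(uint16_t) + 63;

    char *raw = malloc(total);
    if (raw == NULL)
        return -1;

    char *base = (char *)PAD_TO(raw, 64);   /* align the whole block once */
    a->buf_start = raw;
    a->window = (unsigned char *)(base + window_pos);
    a->prev   = (uint16_t *)(base + prev_pos);
    memset(a->prev, 0, prev_count * sizeof(uint16_t));
    return 0;
}
```

Freeing mirrors free_deflate: a single free(a->buf_start) releases every sub-buffer at once.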
*/ - s->pending_buf = (unsigned char *) ZALLOC(strm, s->lit_bufsize, 4); s->pending_buf_size = s->lit_bufsize * 4; if (s->window == NULL || s->prev == NULL || s->head == NULL || s->pending_buf == NULL) { @@ -304,8 +396,15 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level PREFIX(deflateEnd)(strm); return Z_MEM_ERROR; } + +#ifdef LIT_MEM + s->d_buf = (uint16_t *)(s->pending_buf + (s->lit_bufsize << 1)); + s->l_buf = s->pending_buf + (s->lit_bufsize << 2); + s->sym_end = s->lit_bufsize - 1; +#else s->sym_buf = s->pending_buf + s->lit_bufsize; s->sym_end = (s->lit_bufsize - 1) * 3; +#endif /* We avoid equality with lit_bufsize*3 because of wraparound at 64K * on 16 bit machines and because stored blocks are restricted to * 64K-1 bytes. @@ -348,7 +447,7 @@ static int deflateStateCheck(PREFIX3(stream) *strm) { if (strm == NULL || strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0) return 1; s = strm->state; - if (s == NULL || s->strm != strm || (s->status < INIT_STATE || s->status > MAX_STATE)) + if (s == NULL || s->alloc_bufs == NULL || s->strm != strm || (s->status < INIT_STATE || s->status > MAX_STATE)) return 1; return 0; } @@ -370,7 +469,7 @@ int32_t Z_EXPORT PREFIX(deflateSetDictionary)(PREFIX3(stream) *strm, const uint8 /* when using zlib wrappers, compute Adler-32 for provided dictionary */ if (wrap == 1) - strm->adler = functable.adler32(strm->adler, dictionary, dictLength); + strm->adler = FUNCTABLE_CALL(adler32)(strm->adler, dictionary, dictLength); DEFLATE_SET_DICTIONARY_HOOK(strm, dictionary, dictLength); /* hook for IBM Z DFLTCC */ s->wrap = 0; /* avoid computing Adler-32 in read_buf */ @@ -457,7 +556,7 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) { #ifdef GZIP if (s->wrap == 2) { - strm->adler = functable.crc32_fold_reset(&s->crc_fold); + strm->adler = FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold); } else #endif strm->adler = ADLER32_INITIAL_VALUE; @@ -506,9 +605,17 @@ int32_t Z_EXPORT PREFIX(deflatePrime)(PREFIX3(stream) *strm, int32_t bits, int32 if (deflateStateCheck(strm)) return Z_STREAM_ERROR; s = strm->state; + +#ifdef LIT_MEM + if (bits < 0 || bits > BIT_BUF_SIZE || + (unsigned char *)s->d_buf < s->pending_out + ((BIT_BUF_SIZE + 7) >> 3)) + return Z_BUF_ERROR; +#else if (bits < 0 || bits > BIT_BUF_SIZE || bits > (int32_t)(sizeof(value) << 3) || s->sym_buf < s->pending_out + ((BIT_BUF_SIZE + 7) >> 3)) return Z_BUF_ERROR; +#endif + do { put = BIT_BUF_SIZE - s->bi_valid; put = MIN(put, bits); @@ -555,7 +662,7 @@ int32_t Z_EXPORT PREFIX(deflateParams)(PREFIX3(stream) *strm, int32_t level, int if (s->level != level) { if (s->level == 0 && s->matches != 0) { if (s->matches == 1) { - functable.slide_hash(s); + FUNCTABLE_CALL(slide_hash)(s); } else { CLEAR_HASH(s); } @@ -794,7 +901,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { #ifdef GZIP if (s->status == GZIP_STATE) { /* gzip header */ - functable.crc32_fold_reset(&s->crc_fold); + FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold); put_byte(s, 31); put_byte(s, 139); put_byte(s, 8); @@ -911,7 +1018,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { } } put_short(s, (uint16_t)strm->adler); - functable.crc32_fold_reset(&s->crc_fold); + FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold); } s->status = BUSY_STATE; @@ -982,7 +1089,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { /* Write the trailer */ #ifdef GZIP if (s->wrap == 2) { - strm->adler = functable.crc32_fold_final(&s->crc_fold); 
+ strm->adler = FUNCTABLE_CALL(crc32_fold_final)(&s->crc_fold); put_uint32(s, strm->adler); put_uint32(s, (uint32_t)strm->total_in); @@ -1007,21 +1114,13 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { /* ========================================================================= */ int32_t Z_EXPORT PREFIX(deflateEnd)(PREFIX3(stream) *strm) { - int32_t status; - if (deflateStateCheck(strm)) return Z_STREAM_ERROR; - status = strm->state->status; + int32_t status = strm->state->status; - /* Deallocate in reverse order of allocations: */ - TRY_FREE(strm, strm->state->pending_buf); - TRY_FREE(strm, strm->state->head); - TRY_FREE(strm, strm->state->prev); - TRY_FREE_WINDOW(strm, strm->state->window); - - ZFREE_STATE(strm, strm->state); - strm->state = NULL; + /* Free allocated buffers */ + free_deflate(strm); return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; } @@ -1032,7 +1131,6 @@ int32_t Z_EXPORT PREFIX(deflateEnd)(PREFIX3(stream) *strm) { int32_t Z_EXPORT PREFIX(deflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *source) { deflate_state *ds; deflate_state *ss; - uint32_t window_padding = 0; if (deflateStateCheck(source) || dest == NULL) return Z_STREAM_ERROR; @@ -1041,34 +1139,39 @@ int32_t Z_EXPORT PREFIX(deflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *sou memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream))); - ds = ZALLOC_DEFLATE_STATE(dest); - if (ds == NULL) + deflate_allocs *alloc_bufs = alloc_deflate(dest, ss->w_bits, ss->lit_bufsize); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + + ds = alloc_bufs->state; + dest->state = (struct internal_state *) ds; - ZCOPY_DEFLATE_STATE(ds, ss); + memcpy(ds, ss, sizeof(deflate_state)); ds->strm = dest; -#ifdef X86_PCLMULQDQ_CRC - window_padding = 8; -#endif - - ds->window = (unsigned char *) ZALLOC_WINDOW(dest, ds->w_size + window_padding, 2*sizeof(unsigned char)); - ds->prev = (Pos *) ZALLOC(dest, ds->w_size, sizeof(Pos)); - ds->head = (Pos *) ZALLOC(dest, HASH_SIZE, sizeof(Pos)); - ds->pending_buf = (unsigned char *) ZALLOC(dest, ds->lit_bufsize, 4); + ds->alloc_bufs = alloc_bufs; + ds->window = alloc_bufs->window; + ds->prev = alloc_bufs->prev; + ds->head = alloc_bufs->head; + ds->pending_buf = alloc_bufs->pending_buf; if (ds->window == NULL || ds->prev == NULL || ds->head == NULL || ds->pending_buf == NULL) { PREFIX(deflateEnd)(dest); return Z_MEM_ERROR; } - memcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(unsigned char)); + memcpy(ds->window, ss->window, DEFLATE_ADJUST_WINDOW_SIZE(ds->w_size * 2 * sizeof(unsigned char))); memcpy((void *)ds->prev, (void *)ss->prev, ds->w_size * sizeof(Pos)); memcpy((void *)ds->head, (void *)ss->head, HASH_SIZE * sizeof(Pos)); - memcpy(ds->pending_buf, ss->pending_buf, ds->pending_buf_size); + memcpy(ds->pending_buf, ss->pending_buf, ds->lit_bufsize * LIT_BUFS); ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); +#ifdef LIT_MEM + ds->d_buf = (uint16_t *)(ds->pending_buf + (ds->lit_bufsize << 1)); + ds->l_buf = ds->pending_buf + (ds->lit_bufsize << 2); +#else ds->sym_buf = ds->pending_buf + ds->lit_bufsize; +#endif ds->l_desc.dyn_tree = ds->dyn_ltree; ds->d_desc.dyn_tree = ds->dyn_dtree; @@ -1095,10 +1198,10 @@ Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf, memcpy(buf, strm->next_in, len); #ifdef GZIP } else if (strm->state->wrap == 2) { - functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len); + FUNCTABLE_CALL(crc32_fold_copy)(&strm->state->crc_fold, buf, strm->next_in, len); #endif } else 
if (strm->state->wrap == 1) { - strm->adler = functable.adler32_fold_copy(strm->adler, buf, strm->next_in, len); + strm->adler = FUNCTABLE_CALL(adler32_fold_copy)(strm->adler, buf, strm->next_in, len); } else { memcpy(buf, strm->next_in, len); } @@ -1125,9 +1228,9 @@ static void lm_set_level(deflate_state *s, int level) { s->insert_string = &insert_string_roll; s->quick_insert_string = &quick_insert_string_roll; } else { - s->update_hash = functable.update_hash; - s->insert_string = functable.insert_string; - s->quick_insert_string = functable.quick_insert_string; + s->update_hash = update_hash; + s->insert_string = insert_string; + s->quick_insert_string = quick_insert_string; } s->level = level; @@ -1191,7 +1294,7 @@ void Z_INTERNAL PREFIX(fill_window)(deflate_state *s) { s->block_start -= (int)wsize; if (s->insert > s->strstart) s->insert = s->strstart; - functable.slide_hash(s); + FUNCTABLE_CALL(slide_hash)(s); more += wsize; } if (s->strm->avail_in == 0) @@ -1217,7 +1320,7 @@ void Z_INTERNAL PREFIX(fill_window)(deflate_state *s) { if (s->lookahead + s->insert >= STD_MIN_MATCH) { unsigned int str = s->strstart - s->insert; if (UNLIKELY(s->max_chain_length > 1024)) { - s->ins_h = s->update_hash(s, s->window[str], s->window[str+1]); + s->ins_h = s->update_hash(s->window[str], s->window[str+1]); } else if (str >= 1) { s->quick_insert_string(s, str + 2 - STD_MIN_MATCH); } diff --git a/3rdparty/zlib-ng/deflate.h b/3rdparty/zlib-ng/deflate.h index 8001b47c99..e122ae1aad 100644 --- a/3rdparty/zlib-ng/deflate.h +++ b/3rdparty/zlib-ng/deflate.h @@ -12,8 +12,12 @@ #include "zutil.h" #include "zendian.h" -#include "adler32_fold.h" -#include "crc32_fold.h" +#include "crc32.h" + +#ifdef S390_DFLTCC_DEFLATE +# include "arch/s390/dfltcc_common.h" +# define HAVE_ARCH_DEFLATE_STATE +#endif /* define NO_GZIP when compiling if you want to disable gzip header and trailer creation by deflate(). NO_GZIP would be used to avoid linking in @@ -23,6 +27,12 @@ # define GZIP #endif +/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at + the cost of a larger memory footprint */ +#ifndef NO_LIT_MEM +# define LIT_MEM +#endif + /* =========================================================================== * Internal compression state. 
*/ @@ -108,11 +118,30 @@ typedef uint16_t Pos; /* Type definitions for hash callbacks */ typedef struct internal_state deflate_state; -typedef uint32_t (* update_hash_cb) (deflate_state *const s, uint32_t h, uint32_t val); +typedef uint32_t (* update_hash_cb) (uint32_t h, uint32_t val); typedef void (* insert_string_cb) (deflate_state *const s, uint32_t str, uint32_t count); typedef Pos (* quick_insert_string_cb)(deflate_state *const s, uint32_t str); -struct internal_state { +uint32_t update_hash (uint32_t h, uint32_t val); +void insert_string (deflate_state *const s, uint32_t str, uint32_t count); +Pos quick_insert_string (deflate_state *const s, uint32_t str); + +uint32_t update_hash_roll (uint32_t h, uint32_t val); +void insert_string_roll (deflate_state *const s, uint32_t str, uint32_t count); +Pos quick_insert_string_roll(deflate_state *const s, uint32_t str); + +/* Struct for memory allocation handling */ +typedef struct deflate_allocs_s { + char *buf_start; + free_func zfree; + deflate_state *state; + unsigned char *window; + unsigned char *pending_buf; + Pos *prev; + Pos *head; +} deflate_allocs; + +struct ALIGNED_(64) internal_state { PREFIX3(stream) *strm; /* pointer back to this zlib stream */ unsigned char *pending_buf; /* output still pending */ unsigned char *pending_out; /* next pending byte to output to the stream */ @@ -260,8 +289,16 @@ struct internal_state { * - I can't count above 4 */ +#ifdef LIT_MEM +# define LIT_BUFS 5 + uint16_t *d_buf; /* buffer for distances */ + unsigned char *l_buf; /* buffer for literals/lengths */ +#else +# define LIT_BUFS 4 unsigned char *sym_buf; /* buffer for distances and literals/lengths */ - unsigned int sym_next; /* running index in sym_buf */ +#endif + + unsigned int sym_next; /* running index in symbol buffer */ unsigned int sym_end; /* symbol table full when sym_next reaches this */ unsigned long opt_len; /* bit length of current block with optimal trees */ @@ -273,8 +310,11 @@ struct internal_state { unsigned long compressed_len; /* total bit length of compressed file mod 2^32 */ unsigned long bits_sent; /* bit length of compressed data sent mod 2^32 */ - /* Reserved for future use and alignment purposes */ - char *reserved_p; + deflate_allocs *alloc_bufs; + +#ifdef HAVE_ARCH_DEFLATE_STATE + arch_deflate_state arch; /* architecture-specific extensions */ +#endif uint64_t bi_buf; /* Output buffer. bits are inserted starting at the bottom (least significant bits). 
*/ @@ -284,7 +324,7 @@ struct internal_state { /* Reserved for future use and alignment purposes */ int32_t reserved[11]; -} ALIGNED_(8); +}; typedef enum { need_more, /* block not completed, need more input or more output */ diff --git a/3rdparty/zlib-ng/deflate_fast.c b/3rdparty/zlib-ng/deflate_fast.c index 3184aa718c..2d0444cd73 100644 --- a/3rdparty/zlib-ng/deflate_fast.c +++ b/3rdparty/zlib-ng/deflate_fast.c @@ -1,6 +1,6 @@ /* deflate_fast.c -- compress data using the fast strategy of deflation algorithm * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -41,7 +41,7 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) { * dictionary, and set hash_head to the head of the hash chain: */ if (s->lookahead >= WANT_MIN_MATCH) { - hash_head = functable.quick_insert_string(s, s->strstart); + hash_head = quick_insert_string(s, s->strstart); dist = (int64_t)s->strstart - hash_head; /* Find the longest match, discarding those <= prev_length. @@ -52,7 +52,7 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) { * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ - match_len = functable.longest_match(s, hash_head); + match_len = FUNCTABLE_CALL(longest_match)(s, hash_head); /* longest_match() sets match_start */ } } @@ -71,11 +71,11 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) { match_len--; /* string at strstart already in table */ s->strstart++; - functable.insert_string(s, s->strstart, match_len); + insert_string(s, s->strstart, match_len); s->strstart += match_len; } else { s->strstart += match_len; - functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH); + quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH); /* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not * matter since it will be recomputed at next deflate call. 
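The LIT_MEM changes above trade a little memory for speed: the packed sym_buf, which spends three bytes per symbol (two distance bytes plus one literal/length byte), is replaced by two parallel arrays carved out of pending_buf, a uint16_t distance array at byte offset 2*lit_bufsize and a one-byte literal/length array at 4*lit_bufsize, which is why LIT_BUFS grows from 4 to 5. Below is a minimal standalone sketch of that carving, assuming lit_bufsize is a power of two (so the uint16_t array stays aligned); mini_state, mini_init and mini_tally are illustrative names, not zlib-ng API.

#include <stdint.h>
#include <stdlib.h>

#define LIT_BUFS 5                /* pending_buf bytes per lit_bufsize slot with LIT_MEM */

typedef struct {
    unsigned char *pending_buf;   /* one block: pending output + d_buf + l_buf */
    uint16_t *d_buf;              /* 16-bit distance per symbol, 0 for a literal */
    unsigned char *l_buf;         /* literal byte, or match length - STD_MIN_MATCH */
    unsigned int sym_next;        /* one shared running index for both arrays */
    unsigned int sym_end;
    unsigned int lit_bufsize;
} mini_state;

static int mini_init(mini_state *s, unsigned int lit_bufsize) {
    s->lit_bufsize = lit_bufsize;
    s->pending_buf = (unsigned char *)malloc((size_t)lit_bufsize * LIT_BUFS);
    if (s->pending_buf == NULL)
        return -1;
    /* Same carving as the deflateCopy() hunk earlier in this patch. */
    s->d_buf = (uint16_t *)(s->pending_buf + ((size_t)lit_bufsize << 1));
    s->l_buf = s->pending_buf + ((size_t)lit_bufsize << 2);
    s->sym_next = 0;
    s->sym_end = lit_bufsize - 1; /* LIT_MEM-style limit: one slot per symbol */
    return 0;
}

/* One symbol now costs one store per array instead of three packed byte stores. */
static int mini_tally(mini_state *s, uint16_t dist, unsigned char lc) {
    s->d_buf[s->sym_next] = dist;
    s->l_buf[s->sym_next++] = lc;
    return s->sym_next == s->sym_end; /* nonzero: block is full, flush it */
}

The zng_tr_tally_lit/zng_tr_tally_dist hunks in deflate_p.h later in this patch follow exactly this shape.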
diff --git a/3rdparty/zlib-ng/deflate_huff.c b/3rdparty/zlib-ng/deflate_huff.c index b197e24d7c..d5a234b114 100644 --- a/3rdparty/zlib-ng/deflate_huff.c +++ b/3rdparty/zlib-ng/deflate_huff.c @@ -1,6 +1,6 @@ /* deflate_huff.c -- compress data using huffman encoding only strategy * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git a/3rdparty/zlib-ng/deflate_medium.c b/3rdparty/zlib-ng/deflate_medium.c index 47796e3221..2aeebe2026 100644 --- a/3rdparty/zlib-ng/deflate_medium.c +++ b/3rdparty/zlib-ng/deflate_medium.c @@ -45,16 +45,18 @@ static void insert_match(deflate_state *s, struct match match) { if (UNLIKELY(s->lookahead <= (unsigned int)(match.match_length + WANT_MIN_MATCH))) return; + /* string at strstart already in table */ + match.strstart++; + match.match_length--; + /* matches that are not long enough we need to emit as literals */ - if (LIKELY(match.match_length < WANT_MIN_MATCH)) { - match.strstart++; - match.match_length--; + if (LIKELY(match.match_length < WANT_MIN_MATCH - 1)) { if (UNLIKELY(match.match_length > 0)) { if (match.strstart >= match.orgstart) { if (match.strstart + match.match_length - 1 >= match.orgstart) { - functable.insert_string(s, match.strstart, match.match_length); + insert_string(s, match.strstart, match.match_length); } else { - functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1); + insert_string(s, match.strstart, match.orgstart - match.strstart + 1); } match.strstart += match.match_length; match.match_length = 0; @@ -63,35 +65,18 @@ static void insert_match(deflate_state *s, struct match match) { return; } - /* Insert new strings in the hash table only if the match length - * is not too large. This saves time but degrades compression. - */ - if (match.match_length <= 16 * s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) { - match.match_length--; /* string at strstart already in table */ - match.strstart++; - - if (LIKELY(match.strstart >= match.orgstart)) { - if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) { - functable.insert_string(s, match.strstart, match.match_length); - } else { - functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1); - } - } else if (match.orgstart < match.strstart + match.match_length) { - functable.insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart); + /* Insert into hash table. */ + if (LIKELY(match.strstart >= match.orgstart)) { + if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) { + insert_string(s, match.strstart, match.match_length); + } else { + insert_string(s, match.strstart, match.orgstart - match.strstart + 1); } - match.strstart += match.match_length; - match.match_length = 0; - } else { - match.strstart += match.match_length; - match.match_length = 0; - - if (match.strstart >= (STD_MIN_MATCH - 2)) - functable.quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH); - - /* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not - * matter since it will be recomputed at next deflate call. 
- */ + } else if (match.orgstart < match.strstart + match.match_length) { + insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart); } + match.strstart += match.match_length; + match.match_length = 0; } static void fizzle_matches(deflate_state *s, struct match *current, struct match *next) { @@ -199,7 +184,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { } else { hash_head = 0; if (s->lookahead >= WANT_MIN_MATCH) { - hash_head = functable.quick_insert_string(s, s->strstart); + hash_head = quick_insert_string(s, s->strstart); } current_match.strstart = (uint16_t)s->strstart; @@ -215,7 +200,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ - current_match.match_length = (uint16_t)functable.longest_match(s, hash_head); + current_match.match_length = (uint16_t)FUNCTABLE_CALL(longest_match)(s, hash_head); current_match.match_start = (uint16_t)s->match_start; if (UNLIKELY(current_match.match_length < WANT_MIN_MATCH)) current_match.match_length = 1; @@ -235,7 +220,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { /* now, look ahead one */ if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) { s->strstart = current_match.strstart + current_match.match_length; - hash_head = functable.quick_insert_string(s, s->strstart); + hash_head = quick_insert_string(s, s->strstart); next_match.strstart = (uint16_t)s->strstart; next_match.orgstart = next_match.strstart; @@ -250,7 +235,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). 
*/ - next_match.match_length = (uint16_t)functable.longest_match(s, hash_head); + next_match.match_length = (uint16_t)FUNCTABLE_CALL(longest_match)(s, hash_head); next_match.match_start = (uint16_t)s->match_start; if (UNLIKELY(next_match.match_start >= next_match.strstart)) { /* this can happen due to some restarts */ diff --git a/3rdparty/zlib-ng/deflate_p.h b/3rdparty/zlib-ng/deflate_p.h index dd2021a0f5..7c74ebf5ad 100644 --- a/3rdparty/zlib-ng/deflate_p.h +++ b/3rdparty/zlib-ng/deflate_p.h @@ -1,7 +1,7 @@ /* deflate_p.h -- Private inline functions and macros shared with more than * one deflate method * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * */ @@ -60,27 +60,37 @@ extern const unsigned char Z_INTERNAL zng_dist_code[]; static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) { /* c is the unmatched char */ +#ifdef LIT_MEM + s->d_buf[s->sym_next] = 0; + s->l_buf[s->sym_next++] = c; +#else s->sym_buf[s->sym_next++] = 0; s->sym_buf[s->sym_next++] = 0; s->sym_buf[s->sym_next++] = c; +#endif s->dyn_ltree[c].Freq++; Tracevv((stderr, "%c", c)); Assert(c <= (STD_MAX_MATCH-STD_MIN_MATCH), "zng_tr_tally: bad literal"); return (s->sym_next == s->sym_end); } -static inline int zng_tr_tally_dist(deflate_state *s, uint32_t dist, uint32_t len) { +static inline int zng_tr_tally_dist(deflate_state* s, uint32_t dist, uint32_t len) { /* dist: distance of matched string */ /* len: match length-STD_MIN_MATCH */ +#ifdef LIT_MEM + s->d_buf[s->sym_next] = dist; + s->l_buf[s->sym_next++] = len; +#else s->sym_buf[s->sym_next++] = (uint8_t)(dist); s->sym_buf[s->sym_next++] = (uint8_t)(dist >> 8); s->sym_buf[s->sym_next++] = (uint8_t)len; +#endif s->matches++; dist--; Assert(dist < MAX_DIST(s) && (uint16_t)d_code(dist) < (uint16_t)D_CODES, "zng_tr_tally: bad match"); - s->dyn_ltree[zng_length_code[len]+LITERALS+1].Freq++; + s->dyn_ltree[zng_length_code[len] + LITERALS + 1].Freq++; s->dyn_dtree[d_code(dist)].Freq++; return (s->sym_next == s->sym_end); } diff --git a/3rdparty/zlib-ng/deflate_quick.c b/3rdparty/zlib-ng/deflate_quick.c index df5a17b9e6..5a1937b679 100644 --- a/3rdparty/zlib-ng/deflate_quick.c +++ b/3rdparty/zlib-ng/deflate_quick.c @@ -86,7 +86,7 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) { } if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) { - hash_head = functable.quick_insert_string(s, s->strstart); + hash_head = quick_insert_string(s, s->strstart); dist = (int64_t)s->strstart - hash_head; if (dist <= MAX_DIST(s) && dist > 0) { @@ -94,7 +94,7 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) { const uint8_t *match_start = s->window + hash_head; if (zng_memcmp_2(str_start, match_start) == 0) { - match_len = functable.compare256(str_start+2, match_start+2) + 2; + match_len = FUNCTABLE_CALL(compare256)(str_start+2, match_start+2) + 2; if (match_len >= WANT_MIN_MATCH) { if (UNLIKELY(match_len > s->lookahead)) diff --git a/3rdparty/zlib-ng/deflate_rle.c b/3rdparty/zlib-ng/deflate_rle.c index cd08509946..ee442141be 100644 --- a/3rdparty/zlib-ng/deflate_rle.c +++ b/3rdparty/zlib-ng/deflate_rle.c @@ -1,6 +1,6 @@ /* deflate_rle.c -- compress data using RLE strategy of deflation algorithm * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git 
a/3rdparty/zlib-ng/deflate_slow.c b/3rdparty/zlib-ng/deflate_slow.c index 9f1c913467..de70cc1bba 100644 --- a/3rdparty/zlib-ng/deflate_slow.c +++ b/3rdparty/zlib-ng/deflate_slow.c @@ -1,6 +1,6 @@ /* deflate_slow.c -- compress data using the slow strategy of deflation algorithm * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -19,12 +19,12 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) { int bflush; /* set if current block must be flushed */ int64_t dist; uint32_t match_len; - match_func *longest_match; + match_func longest_match; if (s->max_chain_length <= 1024) - longest_match = &functable.longest_match; + longest_match = FUNCTABLE_FPTR(longest_match); else - longest_match = &functable.longest_match_slow; + longest_match = FUNCTABLE_FPTR(longest_match_slow); /* Process the input block. */ for (;;) { @@ -61,7 +61,7 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) { * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ - match_len = (*longest_match)(s, hash_head); + match_len = longest_match(s, hash_head); /* longest_match() sets match_start */ if (match_len <= 5 && (s->strategy == Z_FILTERED)) { @@ -129,7 +129,7 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) { } Assert(flush != Z_NO_FLUSH, "no flush?"); if (UNLIKELY(s->match_available)) { - (void) zng_tr_tally_lit(s, s->window[s->strstart-1]); + Z_UNUSED(zng_tr_tally_lit(s, s->window[s->strstart-1])); s->match_available = 0; } s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1); diff --git a/3rdparty/zlib-ng/deflate_stored.c b/3rdparty/zlib-ng/deflate_stored.c index 6160896b3f..9e5acfbf96 100644 --- a/3rdparty/zlib-ng/deflate_stored.c +++ b/3rdparty/zlib-ng/deflate_stored.c @@ -1,6 +1,6 @@ /* deflate_stored.c -- store data without compression using deflation algorithm * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -22,7 +22,7 @@ * * deflate_stored() is written to minimize the number of times an input byte is * copied. It is most efficient with large input and output buffers, which - * maximizes the opportunites to have a single copy from next_in to next_out. + * maximizes the opportunities to have a single copy from next_in to next_out. */ Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush) { /* Smallest worthy block size when not flushing or finishing. By default diff --git a/3rdparty/zlib-ng/fallback_builtins.h b/3rdparty/zlib-ng/fallback_builtins.h index 79072a1028..8303508fa1 100644 --- a/3rdparty/zlib-ng/fallback_builtins.h +++ b/3rdparty/zlib-ng/fallback_builtins.h @@ -5,9 +5,6 @@ #if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) #include <intrin.h> -#ifdef X86_FEATURES -# include "arch/x86/x86_features.h" -#endif /* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0. * Because of that assumption trailing_zero is not initialized and the return value is not checked.
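deflate_slow() above selects its matcher through FUNCTABLE_FPTR instead of taking the address of a functable member, and the deflate_fast/medium/quick hunks call quick_insert_string() and insert_string() directly now that hashing is no longer dispatched at runtime. The functable diff that follows keeps the lazy-init pattern only for the remaining entries: each slot starts out pointing at a stub that runs init_functable() once and forwards, while DISABLE_RUNTIME_CPU_DETECTION builds bypass the table via the native_* macros. A condensed single-entry sketch of that pattern, with hypothetical names (ft, init_ft, checksum) and a plain-C fallback standing in for the arch-specific variants:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*adler32_fn)(uint32_t adler, const unsigned char *buf, size_t len);

static uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len);

/* The table initially routes every call through the stub. */
static struct { adler32_fn adler32; } ft = { adler32_stub };

/* Simplified plain-C fallback (the real one defers the modulo). */
static uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t a = adler & 0xffff, b = (adler >> 16) & 0xffff;
    while (len--) {
        a = (a + *buf++) % 65521;
        b = (b + a) % 65521;
    }
    return (b << 16) | a;
}

/* One-time setup: CPU feature detection would pick SSE/NEON/etc. here. */
static void init_ft(void) {
    ft.adler32 = adler32_c;
}

/* First call lands here, fills the table, then forwards to the winner. */
static uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    init_ft();
    return ft.adler32(adler, buf, len);
}

/* Every later call dispatches straight to the chosen variant; ft.adler32 is
 * what FUNCTABLE_CALL(adler32) expands to when runtime detection is on. */
static uint32_t checksum(const unsigned char *buf, size_t len) {
    return ft.adler32(1, buf, len);
}

The real init_functable() additionally issues FUNCTABLE_BARRIER() after the assignments so weakly ordered CPUs never observe a half-written table; the sketch omits that.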
diff --git a/3rdparty/zlib-ng/functable.c b/3rdparty/zlib-ng/functable.c index 37c4aeef7d..495d11edd2 100644 --- a/3rdparty/zlib-ng/functable.c +++ b/3rdparty/zlib-ng/functable.c @@ -2,14 +2,12 @@ * Copyright (C) 2017 Hans Kristian Rosbach * For conditions of distribution and use, see copyright notice in zlib.h */ +#ifndef DISABLE_RUNTIME_CPU_DETECTION #include "zbuild.h" -#include "zendian.h" -#include "crc32_braid_p.h" -#include "deflate.h" -#include "deflate_p.h" #include "functable.h" #include "cpu_features.h" +#include "arch_functions.h" #if defined(_MSC_VER) # include <intrin.h> @@ -61,31 +59,10 @@ static void init_functable(void) { ft.crc32_fold_final = &crc32_fold_final_c; ft.crc32_fold_reset = &crc32_fold_reset_c; ft.inflate_fast = &inflate_fast_c; - ft.insert_string = &insert_string_c; - ft.quick_insert_string = &quick_insert_string_c; ft.slide_hash = &slide_hash_c; - ft.update_hash = &update_hash_c; - -#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN -# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) - ft.longest_match = &longest_match_unaligned_64; - ft.longest_match_slow = &longest_match_slow_unaligned_64; - ft.compare256 = &compare256_unaligned_64; -# elif defined(HAVE_BUILTIN_CTZ) - ft.longest_match = &longest_match_unaligned_32; - ft.longest_match_slow = &longest_match_slow_unaligned_32; - ft.compare256 = &compare256_unaligned_32; -# else - ft.longest_match = &longest_match_unaligned_16; - ft.longest_match_slow = &longest_match_slow_unaligned_16; - ft.compare256 = &compare256_unaligned_16; -# endif -#else - ft.longest_match = &longest_match_c; - ft.longest_match_slow = &longest_match_slow_c; - ft.compare256 = &compare256_c; -#endif- + ft.longest_match = &longest_match_generic; + ft.longest_match_slow = &longest_match_slow_generic; + ft.compare256 = &compare256_generic; // Select arch-optimized functions @@ -110,19 +87,14 @@ static void init_functable(void) { #ifdef X86_SSSE3 if (cf.x86.has_ssse3) { ft.adler32 = &adler32_ssse3; -# ifdef X86_SSE2 ft.chunkmemset_safe = &chunkmemset_safe_ssse3; ft.inflate_fast = &inflate_fast_ssse3; -# endif } #endif // X86 - SSE4.2 #ifdef X86_SSE42 if (cf.x86.has_sse42) { ft.adler32_fold_copy = &adler32_fold_copy_sse42; - ft.insert_string = &insert_string_sse42; - ft.quick_insert_string = &quick_insert_string_sse42; - ft.update_hash = &update_hash_sse42; } #endif // X86 - PCLMUL @@ -151,8 +123,9 @@ static void init_functable(void) { # endif } #endif + // X86 - AVX512 (F,DQ,BW,Vl) #ifdef X86_AVX512 - if (cf.x86.has_avx512) { + if (cf.x86.has_avx512_common) { ft.adler32 = &adler32_avx512; ft.adler32_fold_copy = &adler32_fold_copy_avx512; } @@ -164,8 +137,8 @@ static void init_functable(void) { } #endif // X86 - VPCLMULQDQ -#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC) - if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) { +#ifdef X86_VPCLMULQDQ_CRC + if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) { ft.crc32 = &crc32_vpclmulqdq; ft.crc32_fold = &crc32_fold_vpclmulqdq; ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy; @@ -206,9 +179,6 @@ static void init_functable(void) { #ifdef ARM_ACLE if (cf.arm.has_crc32) { ft.crc32 = &crc32_acle; - ft.insert_string = &insert_string_acle; - ft.quick_insert_string = &quick_insert_string_acle; - ft.update_hash = &update_hash_acle; } #endif @@ -279,12 +249,9 @@ static void init_functable(void) { FUNCTABLE_ASSIGN(ft, crc32_fold_final); FUNCTABLE_ASSIGN(ft, crc32_fold_reset); FUNCTABLE_ASSIGN(ft, inflate_fast); - FUNCTABLE_ASSIGN(ft,
insert_string); FUNCTABLE_ASSIGN(ft, longest_match); FUNCTABLE_ASSIGN(ft, longest_match_slow); - FUNCTABLE_ASSIGN(ft, quick_insert_string); FUNCTABLE_ASSIGN(ft, slide_hash); - FUNCTABLE_ASSIGN(ft, update_hash); // Memory barrier for weak memory order CPUs FUNCTABLE_BARRIER(); @@ -350,11 +317,6 @@ static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) { functable.inflate_fast(strm, start); } -static void insert_string_stub(deflate_state* const s, uint32_t str, uint32_t count) { - init_functable(); - functable.insert_string(s, str, count); -} - static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) { init_functable(); return functable.longest_match(s, cur_match); @@ -365,21 +327,11 @@ static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) { return functable.longest_match_slow(s, cur_match); } -static Pos quick_insert_string_stub(deflate_state* const s, const uint32_t str) { - init_functable(); - return functable.quick_insert_string(s, str); -} - static void slide_hash_stub(deflate_state* s) { init_functable(); functable.slide_hash(s); } -static uint32_t update_hash_stub(deflate_state* const s, uint32_t h, uint32_t val) { - init_functable(); - return functable.update_hash(s, h, val); -} - /* functable init */ Z_INTERNAL struct functable_s functable = { force_init_stub, @@ -394,10 +346,9 @@ Z_INTERNAL struct functable_s functable = { crc32_fold_final_stub, crc32_fold_reset_stub, inflate_fast_stub, - insert_string_stub, longest_match_stub, longest_match_slow_stub, - quick_insert_string_stub, slide_hash_stub, - update_hash_stub }; + +#endif diff --git a/3rdparty/zlib-ng/functable.h b/3rdparty/zlib-ng/functable.h index 9f78188e10..173a030c66 100644 --- a/3rdparty/zlib-ng/functable.h +++ b/3rdparty/zlib-ng/functable.h @@ -7,14 +7,21 @@ #define FUNCTABLE_H_ #include "deflate.h" -#include "crc32_fold.h" -#include "adler32_fold.h" +#include "crc32.h" + +#ifdef DISABLE_RUNTIME_CPU_DETECTION + +# include "arch_functions.h" + +/* When compiling with native instructions it is not necessary to use functable. + * Instead we use native_ macro indicating the best available variant of arch-specific + * functions for the current platform. + */ +# define FUNCTABLE_INIT ((void)0) +# define FUNCTABLE_CALL(name) native_ ## name +# define FUNCTABLE_FPTR(name) &native_ ## name -#ifdef ZLIB_COMPAT -typedef struct z_stream_s z_stream; #else -typedef struct zng_stream_s zng_stream; -#endif struct functable_s { void (* force_init) (void); @@ -29,14 +36,20 @@ struct functable_s { uint32_t (* crc32_fold_final) (struct crc32_fold_s *crc); uint32_t (* crc32_fold_reset) (struct crc32_fold_s *crc); void (* inflate_fast) (PREFIX3(stream) *strm, uint32_t start); - void (* insert_string) (deflate_state *const s, uint32_t str, uint32_t count); uint32_t (* longest_match) (deflate_state *const s, Pos cur_match); uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match); - Pos (* quick_insert_string)(deflate_state *const s, uint32_t str); void (* slide_hash) (deflate_state *s); - uint32_t (* update_hash) (deflate_state *const s, uint32_t h, uint32_t val); }; Z_INTERNAL extern struct functable_s functable; + +/* Explicitly indicate functions are conditionally dispatched. 
+ */ +# define FUNCTABLE_INIT functable.force_init() +# define FUNCTABLE_CALL(name) functable.name +# define FUNCTABLE_FPTR(name) functable.name + +#endif + #endif diff --git a/3rdparty/zlib-ng/gzguts.h b/3rdparty/zlib-ng/gzguts.h index a663844b69..14f2391152 100644 --- a/3rdparty/zlib-ng/gzguts.h +++ b/3rdparty/zlib-ng/gzguts.h @@ -1,7 +1,7 @@ #ifndef GZGUTS_H_ #define GZGUTS_H_ /* gzguts.h -- zlib internal header definitions for gz* operations - * Copyright (C) 2004-2019 Mark Adler + * Copyright (C) 2004-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -135,7 +135,9 @@ typedef gz_state *gz_statep; /* shared functions */ void Z_INTERNAL gz_error(gz_state *, int, const char *); - +#ifdef ZLIB_COMPAT +unsigned Z_INTERNAL gz_intmax(void); +#endif /* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t value -- needed when comparing unsigned to z_off64_t, which is signed (possible z_off64_t types off_t, off64_t, and long are all signed) */ diff --git a/3rdparty/zlib-ng/gzlib.c b/3rdparty/zlib-ng/gzlib.c index e613837efb..b8a506b6a5 100644 --- a/3rdparty/zlib-ng/gzlib.c +++ b/3rdparty/zlib-ng/gzlib.c @@ -1,5 +1,5 @@ /* gzlib.c -- zlib functions common to reading and writing gzip files - * Copyright (C) 2004-2019 Mark Adler + * Copyright (C) 2004-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -523,3 +523,9 @@ void Z_INTERNAL gz_error(gz_state *state, int err, const char *msg) { } (void)snprintf(state->msg, strlen(state->path) + strlen(msg) + 3, "%s%s%s", state->path, ": ", msg); } + +#ifdef ZLIB_COMPAT +unsigned Z_INTERNAL gz_intmax(void) { + return INT_MAX; +} +#endif diff --git a/3rdparty/zlib-ng/infback.c b/3rdparty/zlib-ng/infback.c index 9f5042b4d3..307d05ca3c 100644 --- a/3rdparty/zlib-ng/infback.c +++ b/3rdparty/zlib-ng/infback.c @@ -43,10 +43,15 @@ int32_t ZNG_CONDEXPORT PREFIX(inflateBackInit)(PREFIX3(stream) *strm, int32_t wi } if (strm->zfree == NULL) strm->zfree = PREFIX(zcfree); - state = ZALLOC_INFLATE_STATE(strm); - if (state == NULL) + + inflate_allocs *alloc_bufs = alloc_inflate(strm); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + + state = alloc_bufs->state; + state->alloc_bufs = alloc_bufs; Tracev((stderr, "inflate: allocated\n")); + strm->state = (struct internal_state *)state; state->dmax = 32768U; state->wbits = (unsigned int)windowBits; @@ -55,7 +60,7 @@ int32_t ZNG_CONDEXPORT PREFIX(inflateBackInit)(PREFIX3(stream) *strm, int32_t wi state->wnext = 0; state->whave = 0; state->sane = 1; - state->chunksize = functable.chunksize(); + state->chunksize = FUNCTABLE_CALL(chunksize)(); return Z_OK; } @@ -357,7 +362,7 @@ int32_t Z_EXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in RESTORE(); if (state->whave < state->wsize) state->whave = state->wsize - left; - functable.inflate_fast(strm, state->wsize); + FUNCTABLE_CALL(inflate_fast)(strm, state->wsize); LOAD(); break; } @@ -504,8 +509,10 @@ int32_t Z_EXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in int32_t Z_EXPORT PREFIX(inflateBackEnd)(PREFIX3(stream) *strm) { if (strm == NULL || strm->state == NULL || strm->zfree == NULL) return Z_STREAM_ERROR; - ZFREE_STATE(strm, strm->state); - strm->state = NULL; + + /* Free allocated buffers */ + free_inflate(strm); + Tracev((stderr, "inflate: end\n")); return Z_OK; } diff --git a/3rdparty/zlib-ng/inflate.c b/3rdparty/zlib-ng/inflate.c index fe55c498e3..956f37db7d 100644 --- a/3rdparty/zlib-ng/inflate.c +++ 
b/3rdparty/zlib-ng/inflate.c @@ -19,7 +19,7 @@ /* function prototypes */ static int inflateStateCheck(PREFIX3(stream) *strm); -static int updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum); +static void updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum); static uint32_t syncsearch(uint32_t *have, const unsigned char *buf, uint32_t len); static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst, @@ -28,11 +28,11 @@ static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst, struct inflate_state *state = (struct inflate_state*)strm->state; #ifdef GUNZIP if (state->flags) { - functable.crc32_fold_copy(&state->crc_fold, dst, src, copy); + FUNCTABLE_CALL(crc32_fold_copy)(&state->crc_fold, dst, src, copy); } else #endif { - strm->adler = state->check = functable.adler32_fold_copy(state->check, dst, src, copy); + strm->adler = state->check = FUNCTABLE_CALL(adler32_fold_copy)(state->check, dst, src, copy); } } @@ -40,11 +40,11 @@ static inline void inf_chksum(PREFIX3(stream) *strm, const uint8_t *src, uint32_ struct inflate_state *state = (struct inflate_state*)strm->state; #ifdef GUNZIP if (state->flags) { - functable.crc32_fold(&state->crc_fold, src, len, 0); + FUNCTABLE_CALL(crc32_fold)(&state->crc_fold, src, len, 0); } else #endif { - strm->adler = state->check = functable.adler32(state->check, src, len); + strm->adler = state->check = FUNCTABLE_CALL(adler32)(state->check, src, len); } } @@ -53,7 +53,7 @@ static int inflateStateCheck(PREFIX3(stream) *strm) { if (strm == NULL || strm->zalloc == NULL || strm->zfree == NULL) return 1; state = (struct inflate_state *)strm->state; - if (state == NULL || state->strm != strm || state->mode < HEAD || state->mode > SYNC) + if (state == NULL || state->alloc_bufs == NULL || state->strm != strm || state->mode < HEAD || state->mode > SYNC) return 1; return 0; } @@ -120,13 +120,9 @@ int32_t Z_EXPORT PREFIX(inflateReset2)(PREFIX3(stream) *strm, int32_t windowBits #endif } - /* set number of window bits, free window if different */ + /* set number of window bits */ if (windowBits && (windowBits < MIN_WBITS || windowBits > MAX_WBITS)) return Z_STREAM_ERROR; - if (state->window != NULL && state->wbits != (unsigned)windowBits) { - ZFREE_WINDOW(strm, state->window); - state->window = NULL; - } /* update state and reset the rest of it */ state->wrap = wrap; @@ -134,13 +130,94 @@ int32_t Z_EXPORT PREFIX(inflateReset2)(PREFIX3(stream) *strm, int32_t windowBits return PREFIX(inflateReset)(strm); } -/* This function is hidden in ZLIB_COMPAT builds. */ +#ifdef INF_ALLOC_DEBUG +# include <stdio.h> +# define LOGSZ(name,size) fprintf(stderr, "%s is %d bytes\n", name, size) +# define LOGSZP(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %d, padded %d\n", name, size, loc, pad) +# define LOGSZPL(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %ld, padded %d\n", name, size, loc, pad) +#else +# define LOGSZ(name,size) +# define LOGSZP(name,size,loc,pad) +# define LOGSZPL(name,size,loc,pad) +#endif + +/* =========================================================================== + * Allocate a big buffer and divide it up into the various buffers inflate needs. + * Handles alignment of allocated buffer and alignment of individual buffers.
+ */ +Z_INTERNAL inflate_allocs* alloc_inflate(PREFIX3(stream) *strm) { + int curr_size = 0; + + /* Define sizes */ + int window_size = INFLATE_ADJUST_WINDOW_SIZE((1 << MAX_WBITS) + 64); /* 64B padding for chunksize */ + int state_size = sizeof(inflate_state); + int alloc_size = sizeof(inflate_allocs); + + /* Calculate relative buffer positions and paddings */ + LOGSZP("window", window_size, PAD_WINDOW(curr_size), PADSZ(curr_size,WINDOW_PAD_SIZE)); + int window_pos = PAD_WINDOW(curr_size); + curr_size = window_pos + window_size; + + LOGSZP("state", state_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int state_pos = PAD_64(curr_size); + curr_size = state_pos + state_size; + + LOGSZP("alloc", alloc_size, PAD_16(curr_size), PADSZ(curr_size,16)); + int alloc_pos = PAD_16(curr_size); + curr_size = alloc_pos + alloc_size; + + /* Add 64-1 or 4096-1 to allow window alignment, and round size of buffer up to multiple of 64 */ + int total_size = PAD_64(curr_size + (WINDOW_PAD_SIZE - 1)); + + /* Allocate buffer, align to 64-byte cacheline, and zerofill the resulting buffer */ + char *original_buf = strm->zalloc(strm->opaque, 1, total_size); + if (original_buf == NULL) + return NULL; + + char *buff = (char *)HINT_ALIGNED_WINDOW((char *)PAD_WINDOW(original_buf)); + LOGSZPL("Buffer alloc", total_size, PADSZ((uintptr_t)original_buf,WINDOW_PAD_SIZE), PADSZ(curr_size,WINDOW_PAD_SIZE)); + + /* Initialize alloc_bufs */ + inflate_allocs *alloc_bufs = (struct inflate_allocs_s *)(buff + alloc_pos); + alloc_bufs->buf_start = (char *)original_buf; + alloc_bufs->zfree = strm->zfree; + + alloc_bufs->window = (unsigned char *)HINT_ALIGNED_WINDOW((buff + window_pos)); + alloc_bufs->state = (inflate_state *)HINT_ALIGNED_64((buff + state_pos)); + +#ifdef Z_MEMORY_SANITIZER + /* This is _not_ to subvert the memory sanitizer but to instead unpoison some + data we willingly and purposefully load uninitialized into vector registers + in order to safely read the last < chunksize bytes of the window. */ + __msan_unpoison(alloc_bufs->window + window_size, 64); +#endif + + return alloc_bufs; +} + +/* =========================================================================== + * Free all allocated inflate buffers + */ +Z_INTERNAL void free_inflate(PREFIX3(stream) *strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + + if (state->alloc_bufs != NULL) { + inflate_allocs *alloc_bufs = state->alloc_bufs; + alloc_bufs->zfree(strm->opaque, alloc_bufs->buf_start); + strm->state = NULL; + } +} + +/* =========================================================================== + * Initialize inflate state and buffers. + * This function is hidden in ZLIB_COMPAT builds. + */ int32_t ZNG_CONDEXPORT PREFIX(inflateInit2)(PREFIX3(stream) *strm, int32_t windowBits) { int32_t ret; struct inflate_state *state; - /* Initialize functable earlier.
*/ - functable.force_init(); + /* Initialize functable */ + FUNCTABLE_INIT; if (strm == NULL) return Z_STREAM_ERROR; @@ -151,19 +228,23 @@ int32_t ZNG_CONDEXPORT PREFIX(inflateInit2)(PREFIX3(stream) *strm, int32_t windo } if (strm->zfree == NULL) strm->zfree = PREFIX(zcfree); - state = ZALLOC_INFLATE_STATE(strm); - if (state == NULL) + + inflate_allocs *alloc_bufs = alloc_inflate(strm); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + + state = alloc_bufs->state; + state->window = alloc_bufs->window; + state->alloc_bufs = alloc_bufs; Tracev((stderr, "inflate: allocated\n")); + strm->state = (struct internal_state *)state; state->strm = strm; - state->window = NULL; state->mode = HEAD; /* to pass state test in inflateReset2() */ - state->chunksize = functable.chunksize(); + state->chunksize = FUNCTABLE_CALL(chunksize)(); ret = PREFIX(inflateReset2)(strm, windowBits); if (ret != Z_OK) { - ZFREE_STATE(strm, state); - strm->state = NULL; + free_inflate(strm); } return ret; } @@ -222,31 +303,6 @@ void Z_INTERNAL PREFIX(fixedtables)(struct inflate_state *state) { state->distbits = 5; } -int Z_INTERNAL PREFIX(inflate_ensure_window)(struct inflate_state *state) { - /* if it hasn't been done already, allocate space for the window */ - if (state->window == NULL) { - unsigned wsize = 1U << state->wbits; - state->window = (unsigned char *)ZALLOC_WINDOW(state->strm, wsize + state->chunksize, sizeof(unsigned char)); - if (state->window == NULL) - return Z_MEM_ERROR; -#ifdef Z_MEMORY_SANITIZER - /* This is _not_ to subvert the memory sanitizer but to instead unpoison some - data we willingly and purposefully load uninitialized into vector registers - in order to safely read the last < chunksize bytes of the window. */ - __msan_unpoison(state->window + wsize, state->chunksize); -#endif - } - - /* if window not in use yet, initialize */ - if (state->wsize == 0) { - state->wsize = 1U << state->wbits; - state->wnext = 0; - state->whave = 0; - } - - return Z_OK; -} - /* Update the window with the last wsize (normally 32K) bytes written before returning. If window does not exist yet, create it. This is only called @@ -261,20 +317,20 @@ int Z_INTERNAL PREFIX(inflate_ensure_window)(struct inflate_state *state) { output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ -static int32_t updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum) { +static void updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum) { struct inflate_state *state; uint32_t dist; state = (struct inflate_state *)strm->state; - if (PREFIX(inflate_ensure_window)(state)) return 1; + /* if window not in use yet, initialize */ + if (state->wsize == 0) + state->wsize = 1U << state->wbits; /* len state->wsize or less output bytes into the circular window */ if (len >= state->wsize) { /* Only do this if the caller specifies to checksum bytes AND the platform requires - * it (s/390 being the primary exception to this. Also, for now, do the adler checksums - * if not a gzip based header.
The inline adler checksums will come in the near future, - * possibly the next commit */ + * it (s/390 being the primary exception to this) */ if (INFLATE_NEED_CHECKSUM(strm) && cksum) { /* We have to split the checksum over non-copied and copied bytes */ if (len > state->wsize) @@ -314,7 +370,6 @@ static int32_t updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t state->whave += dist; } } - return 0; } /* @@ -636,7 +691,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { } /* compute crc32 checksum if not in raw mode */ if ((state->wrap & 4) && state->flags) - strm->adler = state->check = functable.crc32_fold_reset(&state->crc_fold); + strm->adler = state->check = FUNCTABLE_CALL(crc32_fold_reset)(&state->crc_fold); state->mode = TYPE; break; #endif @@ -867,7 +922,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { /* use inflate_fast() if we have enough input and output */ if (have >= INFLATE_FAST_MIN_HAVE && left >= INFLATE_FAST_MIN_LEFT) { RESTORE(); - functable.inflate_fast(strm, out); + FUNCTABLE_CALL(inflate_fast)(strm, out); LOAD(); if (state->mode == TYPE) state->back = -1; @@ -1026,7 +1081,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { } else { copy = MIN(state->length, left); - put = functable.chunkmemset_safe(put, state->offset, copy, left); + put = FUNCTABLE_CALL(chunkmemset_safe)(put, state->offset, copy, left); } left -= copy; state->length -= copy; @@ -1056,7 +1111,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { } #ifdef GUNZIP if (state->flags) - strm->adler = state->check = functable.crc32_fold_final(&state->crc_fold); + strm->adler = state->check = FUNCTABLE_CALL(crc32_fold_final)(&state->crc_fold); #endif } out = left; @@ -1098,9 +1153,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { ret = Z_DATA_ERROR; goto inf_leave; - case MEM: - return Z_MEM_ERROR; - case SYNC: default: /* can't happen, but makes compilers happy */ @@ -1111,7 +1163,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call updatewindow() to create and/or update the window state. - Note: a memory error from inflate() is non-recoverable. 
*/ inf_leave: RESTORE(); @@ -1120,10 +1171,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { (state->wsize || (out != strm->avail_out && state->mode < BAD && (state->mode < CHECK || flush != Z_FINISH)))) { /* update sliding window with respective checksum if not in "raw" mode */ - if (updatewindow(strm, strm->next_out, check_bytes, state->wrap & 4)) { - state->mode = MEM; - return Z_MEM_ERROR; - } + updatewindow(strm, strm->next_out, check_bytes, state->wrap & 4); } in -= strm->avail_in; out -= strm->avail_out; @@ -1144,14 +1192,12 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { } int32_t Z_EXPORT PREFIX(inflateEnd)(PREFIX3(stream) *strm) { - struct inflate_state *state; if (inflateStateCheck(strm)) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (state->window != NULL) - ZFREE_WINDOW(strm, state->window); - ZFREE_STATE(strm, strm->state); - strm->state = NULL; + + /* Free allocated buffers */ + free_inflate(strm); + Tracev((stderr, "inflate: end\n")); return Z_OK; } @@ -1179,7 +1225,6 @@ int32_t Z_EXPORT PREFIX(inflateGetDictionary)(PREFIX3(stream) *strm, uint8_t *di int32_t Z_EXPORT PREFIX(inflateSetDictionary)(PREFIX3(stream) *strm, const uint8_t *dictionary, uint32_t dictLength) { struct inflate_state *state; unsigned long dictid; - int32_t ret; /* check state */ if (inflateStateCheck(strm)) @@ -1190,7 +1235,7 @@ int32_t Z_EXPORT PREFIX(inflateSetDictionary)(PREFIX3(stream) *strm, const uint8 /* check for correct dictionary identifier */ if (state->mode == DICT) { - dictid = functable.adler32(ADLER32_INITIAL_VALUE, dictionary, dictLength); + dictid = FUNCTABLE_CALL(adler32)(ADLER32_INITIAL_VALUE, dictionary, dictLength); if (dictid != state->check) return Z_DATA_ERROR; } @@ -1199,11 +1244,8 @@ int32_t Z_EXPORT PREFIX(inflateSetDictionary)(PREFIX3(stream) *strm, const uint8 /* copy dictionary to window using updatewindow(), which will amend the existing dictionary if appropriate */ - ret = updatewindow(strm, dictionary + dictLength, dictLength, 0); - if (ret) { - state->mode = MEM; - return Z_MEM_ERROR; - } + updatewindow(strm, dictionary + dictLength, dictLength, 0); + state->havedict = 1; Tracev((stderr, "inflate: dictionary set\n")); return Z_OK; @@ -1271,7 +1313,7 @@ int32_t Z_EXPORT PREFIX(inflateSync)(PREFIX3(stream) *strm) { /* if first time, start search in bit buffer */ if (state->mode != SYNC) { state->mode = SYNC; - state->hold <<= state->bits & 7; + state->hold >>= state->bits & 7; state->bits -= state->bits & 7; len = 0; while (state->bits >= 8) { @@ -1334,30 +1376,28 @@ int32_t Z_EXPORT PREFIX(inflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *sou return Z_STREAM_ERROR; state = (struct inflate_state *)source->state; + /* copy stream */ + memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream))); + /* allocate space */ - copy = ZALLOC_INFLATE_STATE(source); - if (copy == NULL) + inflate_allocs *alloc_bufs = alloc_inflate(dest); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + copy = alloc_bufs->state; /* copy state */ - memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream))); - ZCOPY_INFLATE_STATE(copy, state); + memcpy(copy, state, sizeof(struct inflate_state)); copy->strm = dest; if (state->lencode >= state->codes && state->lencode <= state->codes + ENOUGH - 1) { copy->lencode = copy->codes + (state->lencode - state->codes); copy->distcode = copy->codes + (state->distcode - state->codes); } copy->next = copy->codes + (state->next - state->codes); + copy->window = 
alloc_bufs->window; + copy->alloc_bufs = alloc_bufs; /* window */ - copy->window = NULL; - if (state->window != NULL) { - if (PREFIX(inflate_ensure_window)(copy)) { - ZFREE_STATE(source, copy); - return Z_MEM_ERROR; - } - ZCOPY_WINDOW(copy->window, state->window, (size_t)state->wsize); - } + memcpy(copy->window, state->window, INFLATE_ADJUST_WINDOW_SIZE((size_t)state->wsize)); dest->state = (struct internal_state *)copy; return Z_OK; diff --git a/3rdparty/zlib-ng/inflate.h b/3rdparty/zlib-ng/inflate.h index 39cdf5d683..536da7d1f8 100644 --- a/3rdparty/zlib-ng/inflate.h +++ b/3rdparty/zlib-ng/inflate.h @@ -11,8 +11,12 @@ #ifndef INFLATE_H_ #define INFLATE_H_ -#include "adler32_fold.h" -#include "crc32_fold.h" +#include "crc32.h" + +#ifdef S390_DFLTCC_INFLATE +# include "arch/s390/dfltcc_common.h" +# define HAVE_ARCH_INFLATE_STATE +#endif /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate(). NO_GZIP would be used to avoid linking in the crc code when it is not needed. @@ -53,14 +57,13 @@ typedef enum { LENGTH, /* i: waiting for 32-bit length (gzip) */ DONE, /* finished check, done -- remain here until reset */ BAD, /* got a data error -- remain here until reset */ - MEM, /* got an inflate() memory error -- remain here until reset */ SYNC /* looking for synchronization bytes to restart inflate() */ } inflate_mode; /* State transitions between above modes - - (most modes can go to BAD or MEM on error -- not shown for clarity) + (most modes can go to BAD on error -- not shown for clarity) Process header: HEAD -> (gzip) or (zlib) or (raw) @@ -81,10 +84,19 @@ typedef enum { Process trailer: CHECK -> LENGTH -> DONE */ +typedef struct inflate_state inflate_state; + +/* Struct for memory allocation handling */ +typedef struct inflate_allocs_s { + char *buf_start; + free_func zfree; + inflate_state *state; + unsigned char *window; +} inflate_allocs; /* State maintained between inflate() calls -- approximately 7K bytes, not including the allocated sliding window, which is up to 32K bytes. */ -struct inflate_state { +struct ALIGNED_(64) inflate_state { PREFIX3(stream) *strm; /* pointer back to this zlib stream */ inflate_mode mode; /* current inflate mode */ int last; /* true if processing last block */ @@ -132,9 +144,14 @@ struct inflate_state { int back; /* bits back of last unprocessed length/lit */ unsigned was; /* initial length of match */ uint32_t chunksize; /* size of memory copying chunk */ + inflate_allocs *alloc_bufs; /* struct for handling memory allocations */ +#ifdef HAVE_ARCH_INFLATE_STATE + arch_inflate_state arch; /* architecture-specific extensions */ +#endif }; -int Z_INTERNAL PREFIX(inflate_ensure_window)(struct inflate_state *state); void Z_INTERNAL PREFIX(fixedtables)(struct inflate_state *state); +Z_INTERNAL inflate_allocs* alloc_inflate(PREFIX3(stream) *strm); +Z_INTERNAL void free_inflate(PREFIX3(stream) *strm); #endif /* INFLATE_H_ */ diff --git a/3rdparty/zlib-ng/inflate_p.h b/3rdparty/zlib-ng/inflate_p.h index eff73876da..c324b0486a 100644 --- a/3rdparty/zlib-ng/inflate_p.h +++ b/3rdparty/zlib-ng/inflate_p.h @@ -10,15 +10,16 @@ /* Architecture-specific hooks. */ #ifdef S390_DFLTCC_INFLATE # include "arch/s390/dfltcc_inflate.h" +/* DFLTCC instructions require window to be page-aligned */ +# define PAD_WINDOW PAD_4096 +# define WINDOW_PAD_SIZE 4096 +# define HINT_ALIGNED_WINDOW HINT_ALIGNED_4096 #else -/* Memory management for the inflate state. Useful for allocating arch-specific extension blocks. 
*/ -# define ZALLOC_INFLATE_STATE(strm) ((struct inflate_state *)ZALLOC(strm, 1, sizeof(struct inflate_state))) -# define ZFREE_STATE(strm, addr) ZFREE(strm, addr) -# define ZCOPY_INFLATE_STATE(dst, src) memcpy(dst, src, sizeof(struct inflate_state)) -/* Memory management for the window. Useful for allocation the aligned window. */ -# define ZALLOC_WINDOW(strm, items, size) ZALLOC(strm, items, size) -# define ZCOPY_WINDOW(dest, src, n) memcpy(dest, src, n) -# define ZFREE_WINDOW(strm, addr) ZFREE(strm, addr) +# define PAD_WINDOW PAD_64 +# define WINDOW_PAD_SIZE 64 +# define HINT_ALIGNED_WINDOW HINT_ALIGNED_64 +/* Adjust the window size for the arch-specific inflate code. */ +# define INFLATE_ADJUST_WINDOW_SIZE(n) (n) /* Invoked at the end of inflateResetKeep(). Useful for initializing arch-specific extension blocks. */ # define INFLATE_RESET_KEEP_HOOK(strm) do {} while (0) /* Invoked at the beginning of inflatePrime(). Useful for updating arch-specific buffers. */ @@ -46,9 +47,9 @@ /* check function to use adler32() for zlib or crc32() for gzip */ #ifdef GUNZIP # define UPDATE(check, buf, len) \ - (state->flags ? PREFIX(crc32)(check, buf, len) : functable.adler32(check, buf, len)) + (state->flags ? PREFIX(crc32)(check, buf, len) : FUNCTABLE_CALL(adler32)(check, buf, len)) #else -# define UPDATE(check, buf, len) functable.adler32(check, buf, len) +# define UPDATE(check, buf, len) FUNCTABLE_CALL(adler32)(check, buf, len) #endif /* check macros for header crc */ diff --git a/3rdparty/zlib-ng/inftrees.c b/3rdparty/zlib-ng/inftrees.c index 423f7b461d..5234fe7ae0 100644 --- a/3rdparty/zlib-ng/inftrees.c +++ b/3rdparty/zlib-ng/inftrees.c @@ -1,5 +1,5 @@ /* inftrees.c -- generate Huffman trees for efficient decoding - * Copyright (C) 1995-2023 Mark Adler + * Copyright (C) 1995-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -7,7 +7,7 @@ #include "zutil.h" #include "inftrees.h" -const char PREFIX(inflate_copyright)[] = " inflate 1.3.0 Copyright 1995-2023 Mark Adler "; +const char PREFIX(inflate_copyright)[] = " inflate 1.3.1 Copyright 1995-2024 Mark Adler "; /* If you use the zlib library in a product, an acknowledgment is welcome in the documentation of your product. 
If for some reason you cannot @@ -54,7 +54,7 @@ int Z_INTERNAL zng_inflate_table(codetype type, uint16_t *lens, unsigned codes, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; static const uint16_t lext[31] = { /* Length codes 257..285 extra */ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, - 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 77, 202}; + 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 203, 77}; static const uint16_t dbase[32] = { /* Distance codes 0..29 base */ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, diff --git a/3rdparty/zlib-ng/insert_string.c b/3rdparty/zlib-ng/insert_string.c index cfe39837f8..11a5b97ffe 100644 --- a/3rdparty/zlib-ng/insert_string.c +++ b/3rdparty/zlib-ng/insert_string.c @@ -1,6 +1,6 @@ /* insert_string.c -- insert_string integer hash variant * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * */ @@ -10,12 +10,12 @@ #define HASH_SLIDE 16 -#define HASH_CALC(s, h, val) h = ((val * 2654435761U) >> HASH_SLIDE); +#define HASH_CALC(h, val) h = ((val * 2654435761U) >> HASH_SLIDE); #define HASH_CALC_VAR h #define HASH_CALC_VAR_INIT uint32_t h = 0 -#define UPDATE_HASH update_hash_c -#define INSERT_STRING insert_string_c -#define QUICK_INSERT_STRING quick_insert_string_c +#define UPDATE_HASH update_hash +#define INSERT_STRING insert_string +#define QUICK_INSERT_STRING quick_insert_string #include "insert_string_tpl.h" diff --git a/3rdparty/zlib-ng/insert_string_roll.c b/3rdparty/zlib-ng/insert_string_roll.c index dfea347bcc..8693f96f59 100644 --- a/3rdparty/zlib-ng/insert_string_roll.c +++ b/3rdparty/zlib-ng/insert_string_roll.c @@ -1,6 +1,6 @@ /* insert_string_roll.c -- insert_string rolling hash variant * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * */ @@ -10,7 +10,7 @@ #define HASH_SLIDE 5 -#define HASH_CALC(s, h, val) h = ((h << HASH_SLIDE) ^ ((uint8_t)val)) +#define HASH_CALC(h, val) h = ((h << HASH_SLIDE) ^ ((uint8_t)val)) #define HASH_CALC_VAR s->ins_h #define HASH_CALC_VAR_INIT #define HASH_CALC_READ val = strstart[0] diff --git a/3rdparty/zlib-ng/insert_string_tpl.h b/3rdparty/zlib-ng/insert_string_tpl.h index c84617730a..281c013463 100644 --- a/3rdparty/zlib-ng/insert_string_tpl.h +++ b/3rdparty/zlib-ng/insert_string_tpl.h @@ -1,10 +1,10 @@ #ifndef INSERT_STRING_H_ #define INSERT_STRING_H_ -/* insert_string.h -- Private insert_string functions shared with more than - * one insert string implementation +/* insert_string_tpl.h -- Private insert_string functions shared with more than + * one insert string implementation * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * * Copyright (C) 2013 Intel Corporation. All rights reserved. * Authors: @@ -47,9 +47,8 @@ * input characters, so that a running hash key can be computed from the * previous key instead of complete recalculation each time. 
*/ -Z_INTERNAL uint32_t UPDATE_HASH(deflate_state *const s, uint32_t h, uint32_t val) { - (void)s; - HASH_CALC(s, h, val); +Z_INTERNAL uint32_t UPDATE_HASH(uint32_t h, uint32_t val) { + HASH_CALC(h, val); return h & HASH_CALC_MASK; } @@ -65,7 +64,7 @@ Z_INTERNAL Pos QUICK_INSERT_STRING(deflate_state *const s, uint32_t str) { HASH_CALC_VAR_INIT; HASH_CALC_READ; - HASH_CALC(s, HASH_CALC_VAR, val); + HASH_CALC(HASH_CALC_VAR, val); HASH_CALC_VAR &= HASH_CALC_MASK; hm = HASH_CALC_VAR; @@ -94,7 +93,7 @@ Z_INTERNAL void INSERT_STRING(deflate_state *const s, uint32_t str, uint32_t cou HASH_CALC_VAR_INIT; HASH_CALC_READ; - HASH_CALC(s, HASH_CALC_VAR, val); + HASH_CALC(HASH_CALC_VAR, val); HASH_CALC_VAR &= HASH_CALC_MASK; hm = HASH_CALC_VAR; diff --git a/3rdparty/zlib-ng/match_tpl.h b/3rdparty/zlib-ng/match_tpl.h index d076798520..9c258242cd 100644 --- a/3rdparty/zlib-ng/match_tpl.h +++ b/3rdparty/zlib-ng/match_tpl.h @@ -1,6 +1,6 @@ /* match_tpl.h -- find longest match template for compare256 variants * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Portions copyright (C) 2014-2021 Konstantin Nosov @@ -8,11 +8,6 @@ * https://github.com/gildor2/fast_zlib */ -#include "zbuild.h" -#include "zutil_p.h" -#include "deflate.h" -#include "functable.h" - #ifndef MATCH_TPL_H #define MATCH_TPL_H @@ -107,11 +102,11 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { * to cur_match). We cannot use s->prev[strstart+1,...] immediately, because * these strings are not yet inserted into the hash table. */ - hash = s->update_hash(s, 0, scan[1]); - hash = s->update_hash(s, hash, scan[2]); + hash = s->update_hash(0, scan[1]); + hash = s->update_hash(hash, scan[2]); for (i = 3; i <= best_len; i++) { - hash = s->update_hash(s, hash, scan[i]); + hash = s->update_hash(hash, scan[i]); /* If we're starting with best_len >= 3, we can use offset search. 
*/ pos = s->head[hash]; @@ -241,9 +236,9 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { */ scan_endstr = scan + len - (STD_MIN_MATCH+1); - hash = s->update_hash(s, 0, scan_endstr[0]); - hash = s->update_hash(s, hash, scan_endstr[1]); - hash = s->update_hash(s, hash, scan_endstr[2]); + hash = s->update_hash(0, scan_endstr[0]); + hash = s->update_hash(hash, scan_endstr[1]); + hash = s->update_hash(hash, scan_endstr[2]); pos = s->head[hash]; if (pos < cur_match) { diff --git a/3rdparty/zlib-ng/patches/zlib-ng-2.2.1-detect-intrinsics.patch b/3rdparty/zlib-ng/patches/zlib-ng-2.2.1-detect-intrinsics.patch new file mode 100644 index 0000000000..237770d204 --- /dev/null +++ b/3rdparty/zlib-ng/patches/zlib-ng-2.2.1-detect-intrinsics.patch @@ -0,0 +1,13 @@ +diff --git a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake +index 14f82fc..78e46e1 100644 +--- a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake ++++ b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake +@@ -66,7 +66,7 @@ macro(check_armv6_compiler_flag) + return __uqsub16(a, b); + #endif + } +- int main(void) { return 0; }" ++ int main(void) { return f(1,2); }" + HAVE_ARMV6_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) diff --git a/3rdparty/zlib-ng/patches/zlib-ng-2.2.1.patch b/3rdparty/zlib-ng/patches/zlib-ng-2.2.1.patch new file mode 100644 index 0000000000..fb3699dcff --- /dev/null +++ b/3rdparty/zlib-ng/patches/zlib-ng-2.2.1.patch @@ -0,0 +1,148 @@ +--- ./CMakeLists.txt 2024-09-11 12:28:30.597680661 +0300 ++++ ../../../zlib-ng/CMakeLists.txt 2024-09-11 12:29:10.013644583 +0300 +@@ -74,10 +74,10 @@ + # Options parsing + # + option(WITH_GZFILEOP "Compile with support for gzFile related functions" ON) +-option(ZLIB_COMPAT "Compile with zlib compatible API" ON) +-option(ZLIB_ENABLE_TESTS "Build test binaries" OFF) +-option(ZLIBNG_ENABLE_TESTS "Test zlib-ng specific API" OFF) +-option(WITH_GTEST "Build gtest_zlib" OFF) ++option(ZLIB_COMPAT "Compile with zlib compatible API" OFF) ++option(ZLIB_ENABLE_TESTS "Build test binaries" ON) ++option(ZLIBNG_ENABLE_TESTS "Test zlib-ng specific API" ON) ++option(WITH_GTEST "Build gtest_zlib" ON) + option(WITH_FUZZERS "Build test/fuzz" OFF) + option(WITH_BENCHMARKS "Build test/benchmarks" OFF) + option(WITH_BENCHMARK_APPS "Build application benchmarks" OFF) +@@ -128,11 +128,6 @@ + + option(INSTALL_UTILS "Copy minigzip and minideflate during install" OFF) + +-set(ZLIB_BUILD_SHARED_LIBS OFF) +-set(SKIP_INSTALL_ALL ON) +-ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes -Wmissing-declarations -Wundef -Wstrict-prototypes -Wtype-limits) +-ocv_warnings_disable(CMAKE_C_FLAGS /wd4819 /wd4244 /wd4334) +- + mark_as_advanced(FORCE + ZLIB_SYMBOL_PREFIX + WITH_REDUCED_MEM +@@ -1147,22 +1142,21 @@ + list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS}) + endif() + +-if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) ++if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS) + set(ZLIB_DLL_SRCS win32/zlib${SUFFIX}1.rc) + endif() + +-if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS) ++if(NOT DEFINED BUILD_SHARED_LIBS) + add_library(zlib SHARED ${ZLIB_ALL_SRCS} ${ZLIB_DLL_SRCS}) + add_library(zlibstatic STATIC ${ZLIB_ALL_SRCS}) + + set(ZLIB_INSTALL_LIBRARIES zlib zlibstatic) + else() ++ add_library(zlib ${ZLIB_ALL_SRCS}) + +- if(ZLIB_BUILD_SHARED_LIBS) +- add_library(zlib SHARED ${ZLIB_ALL_SRCS} ${ZLIB_DLL_SRCS}) ++ if(BUILD_SHARED_LIBS) + target_sources(zlib PRIVATE ${ZLIB_DLL_SRCS}) + else() +- add_library(zlib STATIC 
${ZLIB_ALL_SRCS}) + add_library(zlibstatic ALIAS zlib) + endif() + +@@ -1195,17 +1189,17 @@ + + if(WIN32) + # Shared library +- if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) ++ if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS) + set_target_properties(zlib PROPERTIES OUTPUT_NAME zlib${SUFFIX}) + endif() + # Static library +- if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS) ++ if(NOT DEFINED BUILD_SHARED_LIBS) + if(MSVC) + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME zlibstatic${SUFFIX}) + else() + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME z${SUFFIX}) + endif() +- elseif(NOT ZLIB_BUILD_SHARED_LIBS) ++ elseif(NOT BUILD_SHARED_LIBS) + if(MSVC) + set_target_properties(zlib PROPERTIES OUTPUT_NAME zlibstatic${SUFFIX}) + else() +@@ -1217,7 +1211,7 @@ + set_target_properties(${ZLIB_INSTALL_LIBRARIES} PROPERTIES OUTPUT_NAME z${SUFFIX}) + endif() + +-if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) ++if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS) + set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) + + if(ZLIB_COMPAT) +@@ -1277,6 +1271,8 @@ + if(WITH_GZFILEOP) + set(PKG_CONFIG_CFLAGS "-DWITH_GZFILEOP") + endif() ++configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib.pc.cmakein ++ ${ZLIB_PC} @ONLY) + configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h.cmakein + ${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h @ONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib${SUFFIX}.h.in +@@ -1326,6 +1322,17 @@ + set(PACKAGE_CONFIGNAME zlib-ng) + set(PACKAGE_VERSION ${ZLIBNG_HEADER_VERSION}) + endif() ++ configure_package_config_file(${PACKAGE_CONFIGNAME}-config.cmake.in ++ ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_CONFIGNAME}-config.cmake ++ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${EXPORT_NAME} ++ PATH_VARS INCLUDE_INSTALL_DIR LIB_INSTALL_DIR) ++ write_basic_package_version_file( ++ ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_CONFIGNAME}-config-version.cmake ++ VERSION ${PACKAGE_VERSION} ++ COMPATIBILITY AnyNewerVersion) ++ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_CONFIGNAME}-config.cmake ++ ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_CONFIGNAME}-config-version.cmake ++ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${EXPORT_NAME}) + endif() + + #============================================================================ +@@ -1335,7 +1342,7 @@ + if(ZLIB_ENABLE_TESTS) + enable_testing() + +- if(ZLIB_BUILD_SHARED_LIBS) ++ if(BUILD_SHARED_LIBS) + if(ZLIBNG_ENABLE_TESTS) + message(STATUS "Disabling zlib-ng tests because shared libraries are enabled") + set(ZLIBNG_ENABLE_TESTS OFF) +@@ -1399,12 +1406,19 @@ + + FEATURE_SUMMARY(WHAT ALL INCLUDE_QUIET_PACKAGES) + +-if(ENABLE_SOLUTION_FOLDERS) +- set_target_properties(${ZLIB_INSTALL_LIBRARIES} PROPERTIES FOLDER "3rdparty") +-endif() ++#============================================================================ ++# CPack ++#============================================================================ ++set(CPACK_GENERATOR "TGZ") ++set(CPACK_SOURCE_GENERATOR "TGZ") ++set(CPACK_SOURCE_IGNORE_FILES .git/ _CPack_Packages/ "${PROJECT_BINARY_DIR}/") ++ ++set(CPACK_PACKAGE_NAME "zlib${SUFFIX}") ++set(CPACK_PACKAGE_VERSION ${ZLIB_FULL_VERSION}) ++set(CPACK_PACKAGE_DIRECTORY "${PROJECT_BINARY_DIR}/package") + +-if(NOT BUILD_SHARED_LIBS) +- ocv_install_target(${ZLIB_INSTALL_LIBRARIES} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) ++if("${PROJECT_BINARY_DIR}" STREQUAL "${PROJECT_SOURCE_DIR}") ++ message(WARNING "Building to source folder is not recommended. 
Cpack will be unable to generate source package.") + endif() + +-ocv_install_3rdparty_licenses(${ZLIB_INSTALL_LIBRARIES} LICENSE.md) ++include(CPack) diff --git a/3rdparty/zlib-ng/trees.c b/3rdparty/zlib-ng/trees.c index 5bb88389ba..9f2f49137f 100644 --- a/3rdparty/zlib-ng/trees.c +++ b/3rdparty/zlib-ng/trees.c @@ -1,5 +1,5 @@ /* trees.c -- output deflated data using Huffman coding - * Copyright (C) 1995-2021 Jean-loup Gailly + * Copyright (C) 1995-2024 Jean-loup Gailly * detect_data_type() function provided freely by Cosmin Truta, 2006 * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -75,7 +75,6 @@ static int build_bl_tree (deflate_state *s); static void send_all_trees (deflate_state *s, int lcodes, int dcodes, int blcodes); static void compress_block (deflate_state *s, const ct_data *ltree, const ct_data *dtree); static int detect_data_type (deflate_state *s); -static void bi_flush (deflate_state *s); /* =========================================================================== * Initialize the tree data structures for a new zlib stream. @@ -609,13 +608,6 @@ void Z_INTERNAL zng_tr_stored_block(deflate_state *s, char *buf, uint32_t stored } } -/* =========================================================================== - * Flush the bits in the bit buffer to pending output (leaves at most 7 bits) - */ -void Z_INTERNAL zng_tr_flush_bits(deflate_state *s) { - bi_flush(s); -} - /* =========================================================================== * Send one empty static block to give enough lookahead for inflate. * This takes 10 bits, of which 7 may remain in the bit buffer. @@ -623,7 +615,7 @@ void Z_INTERNAL zng_tr_flush_bits(deflate_state *s) { void Z_INTERNAL zng_tr_align(deflate_state *s) { zng_tr_emit_tree(s, STATIC_TREES, 0); zng_tr_emit_end_block(s, static_ltree, 0); - bi_flush(s); + zng_tr_flush_bits(s); } /* =========================================================================== @@ -718,21 +710,30 @@ static void compress_block(deflate_state *s, const ct_data *ltree, const ct_data /* dtree: distance tree */ unsigned dist; /* distance of matched string */ int lc; /* match length or unmatched char (if dist == 0) */ - unsigned sx = 0; /* running index in sym_buf */ + unsigned sx = 0; /* running index in symbol buffers */ if (s->sym_next != 0) { do { +#ifdef LIT_MEM + dist = s->d_buf[sx]; + lc = s->l_buf[sx++]; +#else dist = s->sym_buf[sx++] & 0xff; dist += (unsigned)(s->sym_buf[sx++] & 0xff) << 8; lc = s->sym_buf[sx++]; +#endif if (dist == 0) { zng_emit_lit(s, ltree, lc); } else { zng_emit_dist(s, ltree, dtree, lc, dist); } /* literal or match pair ? */ - /* Check that the overlay between pending_buf and sym_buf is ok: */ + /* Check for no overlay of pending_buf on needed symbols */ +#ifdef LIT_MEM + Assert(s->pending < 2 * (s->lit_bufsize + sx), "pending_buf overflow"); +#else Assert(s->pending < s->lit_bufsize + sx, "pending_buf overflow"); +#endif } while (sx < s->sym_next); } @@ -781,27 +782,26 @@ static int detect_data_type(deflate_state *s) { /* =========================================================================== * Flush the bit buffer, keeping at most 7 bits in it. 
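For context on the `LIT_MEM` switch toggled in the compress_block hunk above: the classic layout packs each (distance, length/literal) pair into three bytes of `sym_buf`, while `LIT_MEM` keeps two parallel arrays indexed by one symbol counter. A hedged sketch of the two decodings (illustrative struct, not the real `deflate_state`):

```c
/* Sketch of the two symbol layouts toggled by LIT_MEM; field names follow
 * the hunk, the surrounding struct is illustrative only. */
#include <stdint.h>

typedef struct {
    uint16_t *d_buf;   /* LIT_MEM: one distance per symbol */
    uint8_t  *l_buf;   /* LIT_MEM: one length/literal byte per symbol */
    uint8_t  *sym_buf; /* classic: three packed bytes per symbol */
} sym_bufs;

/* Classic layout: dist low byte, dist high byte, then length/literal. */
static void read_sym_packed(const sym_bufs *s, unsigned *sx,
                            unsigned *dist, int *lc) {
    *dist  = s->sym_buf[(*sx)++];
    *dist += (unsigned)s->sym_buf[(*sx)++] << 8;
    *lc    = s->sym_buf[(*sx)++];
}

/* LIT_MEM layout: parallel arrays share the same index. */
static void read_sym_split(const sym_bufs *s, unsigned *sx,
                           unsigned *dist, int *lc) {
    *dist = s->d_buf[*sx];
    *lc   = s->l_buf[(*sx)++];
}
```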
*/ -static void bi_flush(deflate_state *s) { - if (s->bi_valid == 64) { - put_uint64(s, s->bi_buf); - s->bi_buf = 0; - s->bi_valid = 0; - } else { - if (s->bi_valid >= 32) { - put_uint32(s, (uint32_t)s->bi_buf); - s->bi_buf >>= 32; - s->bi_valid -= 32; - } - if (s->bi_valid >= 16) { - put_short(s, (uint16_t)s->bi_buf); - s->bi_buf >>= 16; - s->bi_valid -= 16; - } - if (s->bi_valid >= 8) { - put_byte(s, s->bi_buf); - s->bi_buf >>= 8; - s->bi_valid -= 8; - } +void Z_INTERNAL zng_tr_flush_bits(deflate_state *s) { + if (s->bi_valid >= 48) { + put_uint32(s, (uint32_t)s->bi_buf); + put_short(s, (uint16_t)(s->bi_buf >> 32)); + s->bi_buf >>= 48; + s->bi_valid -= 48; + } else if (s->bi_valid >= 32) { + put_uint32(s, (uint32_t)s->bi_buf); + s->bi_buf >>= 32; + s->bi_valid -= 32; + } + if (s->bi_valid >= 16) { + put_short(s, (uint16_t)s->bi_buf); + s->bi_buf >>= 16; + s->bi_valid -= 16; + } + if (s->bi_valid >= 8) { + put_byte(s, s->bi_buf); + s->bi_buf >>= 8; + s->bi_valid -= 8; } } diff --git a/3rdparty/zlib-ng/win32/Makefile.a64 b/3rdparty/zlib-ng/win32/Makefile.a64 new file mode 100644 index 0000000000..9f8d6fb7fa --- /dev/null +++ b/3rdparty/zlib-ng/win32/Makefile.a64 @@ -0,0 +1,252 @@ +# Makefile for zlib using Microsoft (Visual) C +# zlib is copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler +# +# Usage: +# nmake -f win32/Makefile.a64 (standard build) +# nmake -f win32/Makefile.a64 LOC=-DFOO (nonstandard build) + +# The toplevel directory of the source tree. +# +TOP = . + +# optional build flags +LOC = + +# variables +STATICLIB = zlib.lib +SHAREDLIB = zlib1.dll +IMPLIB = zdll.lib +SYMBOL_PREFIX = + +CC = cl +LD = link +AR = lib +RC = rc +CP = copy /y +INCLUDES = -I$(TOP) -I$(TOP)/arch/arm -I$(TOP)/arch/generic +CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) $(INCLUDES) +WFLAGS = \ + -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ + -D_CRT_SECURE_NO_DEPRECATE \ + -D_CRT_NONSTDC_NO_DEPRECATE \ + -DARM_FEATURES \ + -DARM_NEON_HASLD4 \ + # +LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest +ARFLAGS = -nologo +RCFLAGS = /dARM64 /r +DEFFILE = zlib.def +RCFILE = zlib1.rc +RESFILE = zlib1.res +WITH_GZFILEOP = yes +ZLIB_COMPAT = +SUFFIX = + +OBJS = \ + adler32.obj \ + adler32_c.obj \ + adler32_fold_c.obj \ + arm_features.obj \ + chunkset_c.obj \ + compare256_c.obj \ + compress.obj \ + cpu_features.obj \ + crc32.obj \ + crc32_braid_c.obj \ + crc32_braid_comb.obj \ + crc32_fold_c.obj \ + deflate.obj \ + deflate_fast.obj \ + deflate_huff.obj \ + deflate_medium.obj \ + deflate_quick.obj \ + deflate_rle.obj \ + deflate_slow.obj \ + deflate_stored.obj \ + functable.obj \ + infback.obj \ + inflate.obj \ + inftrees.obj \ + insert_string.obj \ + insert_string_roll.obj \ + slide_hash_c.obj \ + trees.obj \ + uncompr.obj \ + zutil.obj \ + # +!if "$(ZLIB_COMPAT)" != "" +WITH_GZFILEOP = yes +WFLAGS = $(WFLAGS) -DZLIB_COMPAT +DEFFILE = zlibcompat.def +!else +STATICLIB = zlib-ng.lib +SHAREDLIB = zlib-ng1.dll +IMPLIB = zngdll.lib +DEFFILE = zlib-ng.def +RCFILE = zlib-ng1.rc +RESFILE = zlib-ng1.res +SUFFIX = -ng +!endif + +!if "$(WITH_GZFILEOP)" != "" +WFLAGS = $(WFLAGS) -DWITH_GZFILEOP +OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj +!endif + +WFLAGS = $(WFLAGS) \ + -DARM_ACLE \ + -D__ARM_NEON__=1 \ + -DARM_NEON \ + -DARM_NOCHECK_NEON \ + # +OBJS = $(OBJS) crc32_acle.obj adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj + +# targets +all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \ + example.exe minigzip.exe example_d.exe minigzip_d.exe + +!if "$(SYMBOL_PREFIX)" != "" 
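The rewritten `zng_tr_flush_bits` above drains the 64-bit bit buffer in 48/32/16/8-bit steps, leaving at most 7 bits pending. A standalone sketch of the same drain against a plain byte sink, assuming the little-endian byte order that `put_uint32`/`put_short`/`put_byte` emit:

```c
/* Standalone sketch of the bit-buffer drain in zng_tr_flush_bits,
 * writing little-endian into a caller-provided byte sink. */
#include <stdint.h>

typedef struct { uint8_t *out; uint64_t bi_buf; int bi_valid; } bitbuf;

static void put_bytes(bitbuf *b, uint64_t v, int nbytes) {
    for (int i = 0; i < nbytes; i++)
        *b->out++ = (uint8_t)(v >> (8 * i));
}

static void flush_bits_sketch(bitbuf *b) {
    if (b->bi_valid >= 48)      { put_bytes(b, b->bi_buf, 6); b->bi_buf >>= 48; b->bi_valid -= 48; }
    else if (b->bi_valid >= 32) { put_bytes(b, b->bi_buf, 4); b->bi_buf >>= 32; b->bi_valid -= 32; }
    if (b->bi_valid >= 16)      { put_bytes(b, b->bi_buf, 2); b->bi_buf >>= 16; b->bi_valid -= 16; }
    if (b->bi_valid >= 8)       { put_bytes(b, b->bi_buf, 1); b->bi_buf >>= 8;  b->bi_valid -= 8;  }
    /* At most 7 bits remain, matching the comment above the function. */
}
```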
+zlib_name_mangling$(SUFFIX).h: zlib_name_mangling$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib_name_mangling$(SUFFIX).h.in zlib_name_mangling$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" +!else +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling.h.empty + $(CP) $(TOP)\zlib_name_mangling.h.empty zlib_name_mangling$(SUFFIX).h +!endif + +zlib$(SUFFIX).h: zlib$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib$(SUFFIX).h.in zlib$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +gzread.c: gzread.c.in + cscript $(TOP)\win32\replace.vbs $(TOP)\gzread.c.in gzread.c "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +zconf: $(TOP)/zconf$(SUFFIX).h.in $(TOP)/zlib$(SUFFIX).h $(TOP)/zlib_name_mangling$(SUFFIX).h + $(CP) $(TOP)\zconf$(SUFFIX).h.in $(TOP)\zconf$(SUFFIX).h + +$(TOP)/win32/$(DEFFILE): $(TOP)/win32/$(DEFFILE).in + cscript $(TOP)\win32\replace.vbs $(TOP)/win32/$(DEFFILE).in $(TOP)/win32/$(DEFFILE) "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +$(STATICLIB): zconf $(OBJS) + $(AR) $(ARFLAGS) -out:$@ $(OBJS) + +$(IMPLIB): $(SHAREDLIB) + +$(SHAREDLIB): zconf $(TOP)/win32/$(DEFFILE) $(OBJS) $(RESFILE) + $(LD) $(LDFLAGS) -def:$(TOP)/win32/$(DEFFILE) -dll -implib:$(IMPLIB) \ + -out:$@ -base:0x55A4C0000 $(OBJS) $(RESFILE) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;2 + +example.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +example_d.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip_d.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +{$(TOP)}.c.obj: + $(CC) -c $(WFLAGS) $(CFLAGS) $< + +gzlib2.obj: gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzlib2.obj gzlib.c + +gzread2.obj: gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzread2.obj gzread.c + +gzwrite2.obj: gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzwrite2.obj gzwrite.c + +{$(TOP)/arch/arm}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/arch/generic}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/test}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP $< + +$(TOP)/zconf$(SUFFIX).h: zconf + +adler32.obj: $(TOP)/adler32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_c.obj: $(TOP)/arch/generic/adler32_c.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_fold_c.obj: $(TOP)/arch/generic/adler32_fold_c.c $(TOP)/zbuild.h $(TOP)/functable.h +chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h +compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zutil_p.h 
$(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compress.obj: $(TOP)/compress.c $(TOP)/zbuild.h $(TOP)/zutil.h +cpu_features.obj: $(TOP)/cpu_features.c $(TOP)/cpu_features.h $(TOP)/zbuild.h +crc32.obj: $(TOP)/crc32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/crc32_braid_tbl.h +crc32_braid_c.obj: $(TOP)/arch/generic/crc32_braid_c.c $(TOP)/zbuild.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h +crc32_braid_comb.obj: $(TOP)/crc32_braid_comb.c $(TOP)/zutil.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h $(TOP)/crc32_braid_comb_p.h +crc32_fold_c.obj: $(TOP)/arch/generic/crc32_fold_c.c $(TOP)/zbuild.h $(TOP)/crc32.h $(TOP)/functable.h $(TOP)/zutil.h +deflate.obj: $(TOP)/deflate.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_fast.obj: $(TOP)/deflate_fast.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_huff.obj: $(TOP)/deflate_huff.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_medium.obj: $(TOP)/deflate_medium.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zutil_p.h +deflate_rle.obj: $(TOP)/deflate_rle.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/compare256_rle.h +deflate_slow.obj: $(TOP)/deflate_slow.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_stored.obj: $(TOP)/deflate_stored.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +functable.obj: $(TOP)/functable.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/cpu_features.h $(TOP)/arch/arm/arm_features.h $(TOP)/arch_functions.h +gzlib.obj: $(TOP)/gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzread.obj: $(TOP)/gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzwrite.obj: $(TOP)/gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +infback.obj: $(TOP)/infback.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h +inflate.obj: $(TOP)/inflate.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h $(TOP)/inffixed_tbl.h +inftrees.obj: $(TOP)/inftrees.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h +insert_string.obj: $(TOP)/insert_string.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +insert_string_roll.obj: $(TOP)/insert_string_roll.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +slide_hash_c.obj: $(TOP)/arch/generic/slide_hash_c.c $(TOP)/zbuild.h $(TOP)/deflate.h +slide_hash_neon.obj: $(TOP)/arch/arm/slide_hash_neon.c $(TOP)/arch/arm/neon_intrins.h $(TOP)/zbuild.h $(TOP)/deflate.h +trees.obj: $(TOP)/trees.c $(TOP)/trees.h $(TOP)/trees_emit.h $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/trees_tbl.h +uncompr.obj: $(TOP)/uncompr.c $(TOP)/zbuild.h $(TOP)/zutil.h +zutil.obj: $(TOP)/zutil.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/zutil_p.h + +$(RESFILE): $(TOP)/win32/$(RCFILE) + $(RC) $(RCFLAGS) /fo$@ $(TOP)/win32/$(RCFILE) + +# testing +test: example.exe minigzip.exe + example + echo hello world | minigzip | minigzip -d + +testdll: example_d.exe minigzip_d.exe + example_d + echo hello world | minigzip_d | minigzip_d -d + +example.obj: $(TOP)/test/example.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h $(TOP)/deflate.h $(TOP)/test/test_shared_ng.h + +minigzip.obj: $(TOP)/test/minigzip.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h + + +# 
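The `test:` target above round-trips "hello world" through minigzip and back. An in-process analog using the classic zlib API, assuming a `ZLIB_COMPAT` build (the native `-ng` API would use `zng_`-prefixed equivalents):

```c
/* In-process analog of: echo hello world | minigzip | minigzip -d
 * Assumes a ZLIB_COMPAT build exposing the classic zlib.h API. */
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "zlib.h"

int main(void) {
    const char msg[] = "hello world";
    unsigned char comp[128], decomp[128];
    uLongf clen = sizeof(comp), dlen = sizeof(decomp);

    assert(compress2(comp, &clen, (const Bytef *)msg, sizeof(msg), 6) == Z_OK);
    assert(uncompress(decomp, &dlen, comp, clen) == Z_OK);
    assert(dlen == sizeof(msg) && memcmp(decomp, msg, dlen) == 0);
    printf("round trip OK (%lu -> %lu -> %lu bytes)\n",
           (unsigned long)sizeof(msg), (unsigned long)clen, (unsigned long)dlen);
    return 0;
}
```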
cleanup +clean: + -del $(STATICLIB) + -del $(SHAREDLIB) + -del $(IMPLIB) + -del *.obj + -del *.res + -del *.exp + -del *.exe + -del *.pdb + -del *.manifest + +distclean: clean + -del zconf$(SUFFIX).h + -del zlib$(SUFFIX).h + -del zlib_name_mangling$(SUFFIX).h + -del $(TOP)\win32\zlib.def + -del $(TOP)\win32\zlibcompat.def + -del $(TOP)\win32\zlib-ng.def + -del gzread.c diff --git a/3rdparty/zlib-ng/win32/Makefile.arm b/3rdparty/zlib-ng/win32/Makefile.arm new file mode 100644 index 0000000000..cab999dfe0 --- /dev/null +++ b/3rdparty/zlib-ng/win32/Makefile.arm @@ -0,0 +1,272 @@ +# Makefile for zlib using Microsoft (Visual) C +# zlib is copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler +# +# Usage: +# nmake -f win32/Makefile.arm (standard build) +# nmake -f win32/Makefile.arm LOC=-DFOO (nonstandard build) + +# The toplevel directory of the source tree. +# +TOP = . + +# optional build flags +LOC = + +# variables +STATICLIB = zlib.lib +SHAREDLIB = zlib1.dll +IMPLIB = zdll.lib +SYMBOL_PREFIX = + +CC = cl +LD = link +AR = lib +RC = rc +CP = copy /y +INCLUDES = -I$(TOP) -I$(TOP)/arch/arm -I$(TOP)/arch/generic +CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) $(INCLUDES) +WFLAGS = \ + -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ + -D_CRT_SECURE_NO_DEPRECATE \ + -D_CRT_NONSTDC_NO_DEPRECATE \ + -DARM_FEATURES \ + -DARM_NEON_HASLD4 \ + # +LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest +ARFLAGS = -nologo +RCFLAGS = /dARM /r +DEFFILE = zlib.def +RCFILE = zlib1.rc +RESFILE = zlib1.res +WITH_GZFILEOP = yes +ZLIB_COMPAT = +WITH_ACLE = +WITH_NEON = +WITH_ARMV6 = +WITH_VFPV3 = +NEON_ARCH = /arch:VFPv4 +SUFFIX = + +OBJS = \ + adler32.obj \ + adler32_c.obj \ + adler32_fold_c.obj \ + arm_features.obj \ + chunkset_c.obj \ + compare256_c.obj \ + compress.obj \ + cpu_features.obj \ + crc32.obj \ + crc32_braid_c.obj \ + crc32_braid_comb.obj \ + crc32_fold_c.obj \ + deflate.obj \ + deflate_fast.obj \ + deflate_huff.obj \ + deflate_medium.obj \ + deflate_quick.obj \ + deflate_rle.obj \ + deflate_slow.obj \ + deflate_stored.obj \ + functable.obj \ + infback.obj \ + inflate.obj \ + inftrees.obj \ + insert_string.obj \ + insert_string_roll.obj \ + slide_hash_c.obj \ + trees.obj \ + uncompr.obj \ + zutil.obj \ + # +!if "$(ZLIB_COMPAT)" != "" +WITH_GZFILEOP = yes +WFLAGS = $(WFLAGS) -DZLIB_COMPAT +DEFFILE = zlibcompat.def +!else +STATICLIB = zlib-ng.lib +SHAREDLIB = zlib-ng1.dll +IMPLIB = zngdll.lib +DEFFILE = zlib-ng.def +RCFILE = zlib-ng1.rc +RESFILE = zlib-ng1.res +SUFFIX = -ng +!endif + +!if "$(WITH_GZFILEOP)" != "" +WFLAGS = $(WFLAGS) -DWITH_GZFILEOP +OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj +!endif + +!if "$(WITH_ACLE)" != "" +WFLAGS = $(WFLAGS) -DARM_ACLE +OBJS = $(OBJS) crc32_acle.obj +!endif +!if "$(WITH_VFPV3)" != "" +NEON_ARCH = /arch:VFPv3 +!endif +!if "$(WITH_NEON)" != "" +CFLAGS = $(CFLAGS) $(NEON_ARCH) +WFLAGS = $(WFLAGS) \ + -D__ARM_NEON__=1 \ + -DARM_NEON \ + -DARM_NOCHECK_NEON \ + # +OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj +!endif +!if "$(WITH_ARMV6)" != "" +WFLAGS = $(WFLAGS) \ + -DARM_SIMD \ + -DARM_NOCHECK_SIMD \ + # +OBJS = $(OBJS) slide_hash_armv6.obj +!endif + +# targets +all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \ + example.exe minigzip.exe example_d.exe minigzip_d.exe + +!if "$(SYMBOL_PREFIX)" != "" +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib_name_mangling$(SUFFIX).h.in zlib_name_mangling$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" 
+!else +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling.h.empty + $(CP) $(TOP)\zlib_name_mangling.h.empty zlib_name_mangling$(SUFFIX).h +!endif + +zlib$(SUFFIX).h: zlib$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib$(SUFFIX).h.in zlib$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +gzread.c: gzread.c.in + cscript $(TOP)\win32\replace.vbs $(TOP)\gzread.c.in gzread.c "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +zconf: $(TOP)/zconf$(SUFFIX).h.in $(TOP)/zlib$(SUFFIX).h $(TOP)/zlib_name_mangling$(SUFFIX).h + $(CP) $(TOP)\zconf$(SUFFIX).h.in $(TOP)\zconf$(SUFFIX).h + +$(TOP)/win32/$(DEFFILE): $(TOP)/win32/$(DEFFILE).in + cscript $(TOP)\win32\replace.vbs $(TOP)/win32/$(DEFFILE).in $(TOP)/win32/$(DEFFILE) "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +$(STATICLIB): zconf $(OBJS) + $(AR) $(ARFLAGS) -out:$@ $(OBJS) + +$(IMPLIB): $(SHAREDLIB) + +$(SHAREDLIB): zconf $(TOP)/win32/$(DEFFILE) $(OBJS) $(RESFILE) + $(LD) $(LDFLAGS) -def:$(TOP)/win32/$(DEFFILE) -dll -implib:$(IMPLIB) \ + -out:$@ -base:0x5A4C0000 $(OBJS) $(RESFILE) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;2 + +example.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +example_d.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip_d.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +{$(TOP)}.c.obj: + $(CC) -c $(WFLAGS) $(CFLAGS) $< + +gzlib2.obj: gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzlib2.obj gzlib.c + +gzread2.obj: gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzread2.obj gzread.c + +gzwrite2.obj: gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzwrite2.obj gzwrite.c + +{$(TOP)/arch/arm}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/arch/generic}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/test}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP $< + +$(TOP)/zconf$(SUFFIX).h: zconf + +adler32.obj: $(TOP)/adler32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_c.obj: $(TOP)/arch/generic/adler32_c.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_fold_c.obj: $(TOP)/arch/generic/adler32_fold_c.c $(TOP)/zbuild.h $(TOP)/functable.h +chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h +compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compress.obj: $(TOP)/compress.c $(TOP)/zbuild.h $(TOP)/zutil.h +cpu_features.obj: $(TOP)/cpu_features.c $(TOP)/cpu_features.h $(TOP)/zbuild.h +crc32.obj: 
$(TOP)/crc32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/crc32_braid_tbl.h +crc32_braid_c.obj: $(TOP)/arch/generic/crc32_braid_c.c $(TOP)/zbuild.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h +crc32_braid_comb.obj: $(TOP)/crc32_braid_comb.c $(TOP)/zutil.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h $(TOP)/crc32_braid_comb_p.h +crc32_fold_c.obj: $(TOP)/arch/generic/crc32_fold_c.c $(TOP)/zbuild.h $(TOP)/crc32.h $(TOP)/functable.h $(TOP)/zutil.h +deflate.obj: $(TOP)/deflate.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_fast.obj: $(TOP)/deflate_fast.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_huff.obj: $(TOP)/deflate_huff.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_medium.obj: $(TOP)/deflate_medium.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zutil_p.h +deflate_rle.obj: $(TOP)/deflate_rle.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/compare256_rle.h +deflate_slow.obj: $(TOP)/deflate_slow.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_stored.obj: $(TOP)/deflate_stored.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +functable.obj: $(TOP)/functable.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/cpu_features.h $(TOP)/arch/arm/arm_features.h $(TOP)/arch_functions.h +gzlib.obj: $(TOP)/gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzread.obj: $(TOP)/gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzwrite.obj: $(TOP)/gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +infback.obj: $(TOP)/infback.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h +inflate.obj: $(TOP)/inflate.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h $(TOP)/inffixed_tbl.h +inftrees.obj: $(TOP)/inftrees.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h +insert_string.obj: $(TOP)/insert_string.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +insert_string_roll.obj: $(TOP)/insert_string_roll.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +slide_hash_c.obj: $(TOP)/arch/generic/slide_hash_c.c $(TOP)/zbuild.h $(TOP)/deflate.h +trees.obj: $(TOP)/trees.c $(TOP)/trees.h $(TOP)/trees_emit.h $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/trees_tbl.h +uncompr.obj: $(TOP)/uncompr.c $(TOP)/zbuild.h $(TOP)/zutil.h +zutil.obj: $(TOP)/zutil.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/zutil_p.h + +$(RESFILE): $(TOP)/win32/$(RCFILE) + $(RC) $(RCFLAGS) /fo$@ $(TOP)/win32/$(RCFILE) + +# testing +test: example.exe minigzip.exe + example + echo hello world | minigzip | minigzip -d + +testdll: example_d.exe minigzip_d.exe + example_d + echo hello world | minigzip_d | minigzip_d -d + +example.obj: $(TOP)/test/example.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h $(TOP)/deflate.h $(TOP)/test/test_shared_ng.h + +minigzip.obj: $(TOP)/test/minigzip.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h + + +# cleanup +clean: + -del $(STATICLIB) + -del $(SHAREDLIB) + -del $(IMPLIB) + -del *.obj + -del *.res + -del *.exp + -del *.exe + -del *.pdb + -del *.manifest + +distclean: clean + -del zconf$(SUFFIX).h + -del zlib$(SUFFIX).h + -del zlib_name_mangling$(SUFFIX).h + -del $(TOP)\win32\zlib.def + -del $(TOP)\win32\zlibcompat.def + -del 
$(TOP)\win32\zlib-ng.def + -del gzread.c diff --git a/3rdparty/zlib-ng/win32/Makefile.msc b/3rdparty/zlib-ng/win32/Makefile.msc new file mode 100644 index 0000000000..8392fe46e7 --- /dev/null +++ b/3rdparty/zlib-ng/win32/Makefile.msc @@ -0,0 +1,292 @@ +# Makefile for zlib using Microsoft (Visual) C +# zlib is copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler +# +# Usage: +# nmake -f win32/Makefile.msc (standard build) +# nmake -f win32/Makefile.msc LOC=-DFOO (nonstandard build) + +# The toplevel directory of the source tree. +# +TOP = . + +# optional build flags +LOC = + +# variables +STATICLIB = zlib.lib +SHAREDLIB = zlib1.dll +IMPLIB = zdll.lib +SYMBOL_PREFIX = + +CC = cl +CXX = cl +LD = link +AR = lib +RC = rc +CP = copy /y +INCLUDES = -I$(TOP) -I$(TOP)/arch/x86 -I$(TOP)/arch/generic +CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) $(INCLUDES) +CXXFLAGS = -nologo -EHsc -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) $(INCLUDES) +WFLAGS = \ + -D_CRT_SECURE_NO_DEPRECATE \ + -D_CRT_NONSTDC_NO_DEPRECATE \ + -DX86_FEATURES \ + -DX86_PCLMULQDQ_CRC \ + -DX86_SSE2 \ + -DX86_SSE42 \ + -DX86_SSSE3 \ + -DX86_AVX2 + +LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest +ARFLAGS = -nologo +RCFLAGS = /dWIN32 /r +DEFFILE = zlib.def +RCFILE = zlib1.rc +RESFILE = zlib1.res +WITH_GZFILEOP = yes +ZLIB_COMPAT = +SUFFIX = + +OBJS = \ + adler32.obj \ + adler32_c.obj \ + adler32_avx2.obj \ + adler32_avx512.obj \ + adler32_avx512_vnni.obj \ + adler32_sse42.obj \ + adler32_ssse3.obj \ + adler32_fold_c.obj \ + chunkset_c.obj \ + chunkset_avx2.obj \ + chunkset_sse2.obj \ + chunkset_ssse3.obj \ + compare256_c.obj \ + compare256_avx2.obj \ + compare256_sse2.obj \ + compress.obj \ + cpu_features.obj \ + crc32.obj \ + crc32_braid_c.obj \ + crc32_braid_comb.obj \ + crc32_fold_c.obj \ + crc32_pclmulqdq.obj \ + deflate.obj \ + deflate_fast.obj \ + deflate_huff.obj \ + deflate_medium.obj \ + deflate_quick.obj \ + deflate_rle.obj \ + deflate_slow.obj \ + deflate_stored.obj \ + functable.obj \ + infback.obj \ + inflate.obj \ + inftrees.obj \ + insert_string.obj \ + insert_string_roll.obj \ + slide_hash_c.obj \ + slide_hash_avx2.obj \ + slide_hash_sse2.obj \ + trees.obj \ + uncompr.obj \ + zutil.obj \ + x86_features.obj \ + # +!if "$(ZLIB_COMPAT)" != "" +WITH_GZFILEOP = yes +WFLAGS = $(WFLAGS) -DZLIB_COMPAT +DEFFILE = zlibcompat.def +!else +STATICLIB = zlib-ng.lib +SHAREDLIB = zlib-ng1.dll +IMPLIB = zngdll.lib +DEFFILE = zlib-ng.def +RCFILE = zlib-ng1.rc +RESFILE = zlib-ng1.res +SUFFIX = -ng +!endif + +!if "$(WITH_GZFILEOP)" != "" +WFLAGS = $(WFLAGS) -DWITH_GZFILEOP +OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj +!endif + +# targets +all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \ + example.exe minigzip.exe example_d.exe minigzip_d.exe + +!if "$(SYMBOL_PREFIX)" != "" +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib_name_mangling$(SUFFIX).h.in zlib_name_mangling$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" +!else +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling.h.empty + $(CP) $(TOP)\zlib_name_mangling.h.empty zlib_name_mangling$(SUFFIX).h +!endif + +zlib$(SUFFIX).h: zlib$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib$(SUFFIX).h.in zlib$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +gzread.c: gzread.c.in + cscript $(TOP)\win32\replace.vbs $(TOP)\gzread.c.in gzread.c "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +zconf: $(TOP)/zconf$(SUFFIX).h.in $(TOP)/zlib$(SUFFIX).h $(TOP)/zlib_name_mangling$(SUFFIX).h + $(CP) 
$(TOP)\zconf$(SUFFIX).h.in $(TOP)\zconf$(SUFFIX).h + +$(TOP)/win32/$(DEFFILE): $(TOP)/win32/$(DEFFILE).in + cscript $(TOP)\win32\replace.vbs $(TOP)/win32/$(DEFFILE).in $(TOP)/win32/$(DEFFILE) "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +$(STATICLIB): zconf $(OBJS) + $(AR) $(ARFLAGS) -out:$@ $(OBJS) + +$(IMPLIB): $(SHAREDLIB) + +$(SHAREDLIB): zconf $(TOP)/win32/$(DEFFILE) $(OBJS) $(RESFILE) + $(LD) $(LDFLAGS) -def:$(TOP)/win32/$(DEFFILE) -dll -implib:$(IMPLIB) \ + -out:$@ $(OBJS) $(RESFILE) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;2 + +depcheck.exe: depcheck.obj + $(LD) $(LDFLAGS) depcheck.obj + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +example.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +example_d.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip_d.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +{$(TOP)}.c.obj: + $(CC) -c $(WFLAGS) $(CFLAGS) $< + +gzlib2.obj: gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzlib2.obj gzlib.c + +gzread2.obj: gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzread2.obj gzread.c + +gzwrite2.obj: gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzwrite2.obj gzwrite.c + +{$(TOP)/arch/x86}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/arch/generic}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/test}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP $< + +$(TOP)/zconf$(SUFFIX).h: zconf + +{$(TOP)/win32}.cpp.obj: + $(CXX) -c -I$(TOP) $(WFLAGS) $(CXXFLAGS) $< + +adler32.obj: $(TOP)/adler32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_c.obj: $(TOP)/arch/generic/adler32_c.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_avx2.obj: $(TOP)/arch/x86/adler32_avx2.c $(TOP)/zbuild.h $(TOP)/adler32_p.h $(TOP)/arch/x86/adler32_avx2_p.h $(TOP)/arch/x86/x86_intrins.h +adler32_avx512.obj: $(TOP)/arch/x86/adler32_avx512.c $(TOP)/zbuild.h $(TOP)/arch_functions.h $(TOP)/adler32_p.h $(TOP)/arch/x86/adler32_avx512_p.h $(TOP)/arch/x86/x86_intrins.h +adler32_avx512_vnni.obj: $(TOP)/arch/x86/adler32_avx512_vnni.c $(TOP)/zbuild.h $(TOP)/arch_functions.h $(TOP)/adler32_p.h $(TOP)/arch/x86/adler32_avx512_p.h \ + $(TOP)/arch/x86/adler32_avx2_p.h $(TOP)/arch/x86/x86_intrins.h +adler32_sse42.obj: $(TOP)/arch/x86/adler32_sse42.c $(TOP)/zbuild.h $(TOP)/adler32_p.h \ + $(TOP)/arch/x86/adler32_ssse3_p.h +adler32_ssse3.obj: $(TOP)/arch/x86/adler32_ssse3.c $(TOP)/zbuild.h $(TOP)/adler32_p.h \ + $(TOP)/arch/x86/adler32_ssse3_p.h +adler32_fold_c.obj: 
$(TOP)/arch/generic/adler32_fold_c.c $(TOP)/zbuild.h $(TOP)/functable.h +chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h +chunkset_avx2.obj: $(TOP)/arch/x86/chunkset_avx2.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h $(TOP)/arch/generic/chunk_permute_table.h +chunkset_sse2.obj: $(TOP)/arch/x86/chunkset_sse2.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h +chunkset_ssse3.obj: $(TOP)/arch/x86/chunkset_ssse3.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h $(TOP)/arch/generic/chunk_permute_table.h +compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compare256_avx2.obj: $(TOP)/arch/x86/compare256_avx2.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compare256_sse2.obj: $(TOP)/arch/x86/compare256_sse2.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compress.obj: $(TOP)/compress.c $(TOP)/zbuild.h $(TOP)/zutil.h +cpu_features.obj: $(TOP)/cpu_features.c $(TOP)/cpu_features.h $(TOP)/zbuild.h +crc32.obj: $(TOP)/crc32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/crc32_braid_tbl.h +crc32_braid_c.obj: $(TOP)/arch/generic/crc32_braid_c.c $(TOP)/zbuild.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h +crc32_braid_comb.obj: $(TOP)/crc32_braid_comb.c $(TOP)/zutil.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h $(TOP)/crc32_braid_comb_p.h +crc32_fold_c.obj: $(TOP)/arch/generic/crc32_fold_c.c $(TOP)/zbuild.h $(TOP)/crc32.h $(TOP)/functable.h $(TOP)/zutil.h +crc32_pclmulqdq.obj: $(TOP)/arch/x86/crc32_pclmulqdq.c $(TOP)/arch/x86/crc32_pclmulqdq_tpl.h +deflate.obj: $(TOP)/deflate.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_fast.obj: $(TOP)/deflate_fast.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_huff.obj: $(TOP)/deflate_huff.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_medium.obj: $(TOP)/deflate_medium.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zutil_p.h +deflate_rle.obj: $(TOP)/deflate_rle.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/compare256_rle.h +deflate_slow.obj: $(TOP)/deflate_slow.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_stored.obj: $(TOP)/deflate_stored.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +functable.obj: $(TOP)/functable.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/cpu_features.h $(TOP)/arch/x86/x86_features.h $(TOP)/arch_functions.h +gzlib.obj: $(TOP)/gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzread.obj: $(TOP)/gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzwrite.obj: $(TOP)/gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +infback.obj: $(TOP)/infback.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h +inflate.obj: $(TOP)/inflate.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h $(TOP)/inffixed_tbl.h +inftrees.obj: $(TOP)/inftrees.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h +insert_string.obj: $(TOP)/insert_string.c $(TOP)/zbuild.h $(TOP)/deflate.h 
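Several of the dependency lines above funnel through `functable.c` and `cpu_features.h`: zlib-ng resolves arch-specific kernels at runtime through a table of function pointers, selected once from detected CPU features. A hedged sketch of the idea (all names illustrative; the real table wires many more entries and uses zlib-ng's own types):

```c
/* Illustrative runtime-dispatch sketch, not the real functable. */
#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*adler32_fn)(uint32_t adler, const uint8_t *buf, size_t len);

/* Portable fallback; SIMD builds would supply e.g. a NEON/AVX variant. */
static uint32_t adler32_scalar(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t a = adler & 0xffff, b = adler >> 16;
    for (size_t i = 0; i < len; i++) {
        a = (a + buf[i]) % 65521;
        b = (b + a) % 65521;
    }
    return (b << 16) | a;
}

static int cpu_has_simd(void) { return 0; } /* stand-in for cpu_features probes */

static struct { adler32_fn adler32; } functable_sketch;

static void functable_init_sketch(void) {
    /* the real functable also selects crc32, compare256, slide_hash, ... */
    functable_sketch.adler32 = adler32_scalar;
    if (cpu_has_simd()) {
        /* functable_sketch.adler32 = adler32_simd_variant; */
    }
}
```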
$(TOP)/insert_string_tpl.h +insert_string_roll.obj: $(TOP)/insert_string_roll.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +slide_hash_c.obj: $(TOP)/arch/generic/slide_hash_c.c $(TOP)/zbuild.h $(TOP)/deflate.h +slide_hash_avx2.obj: $(TOP)/arch/x86/slide_hash_avx2.c $(TOP)/zbuild.h $(TOP)/deflate.h +slide_hash_sse2.obj: $(TOP)/arch/x86/slide_hash_sse2.c $(TOP)/zbuild.h $(TOP)/deflate.h +trees.obj: $(TOP)/trees.c $(TOP)/trees.h $(TOP)/trees_emit.h $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/trees_tbl.h +uncompr.obj: $(TOP)/uncompr.c $(TOP)/zbuild.h $(TOP)/zutil.h +zutil.obj: $(TOP)/zutil.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/zutil_p.h + +$(RESFILE): $(TOP)/win32/$(RCFILE) + $(RC) $(RCFLAGS) /fo$@ $(TOP)/win32/$(RCFILE) + +# testing +depcheck: depcheck.exe + depcheck win32\Makefile.msc . + depcheck win32\Makefile.arm . + depcheck win32\Makefile.a64 . + +test: example.exe minigzip.exe depcheck + example + echo hello world | minigzip | minigzip -d + +testdll: example_d.exe minigzip_d.exe + example_d + echo hello world | minigzip_d | minigzip_d -d + +depcheck.obj: $(TOP)/win32/depcheck.cpp + +example.obj: $(TOP)/test/example.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h $(TOP)/deflate.h $(TOP)/test/test_shared_ng.h + +minigzip.obj: $(TOP)/test/minigzip.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h + + +# cleanup +clean: + -del $(STATICLIB) + -del $(SHAREDLIB) + -del $(IMPLIB) + -del *.obj + -del *.res + -del *.exp + -del *.exe + -del *.pdb + -del *.manifest + +distclean: clean + -del zconf$(SUFFIX).h + -del zlib$(SUFFIX).h + -del zlib_name_mangling$(SUFFIX).h + -del $(TOP)\win32\zlib.def + -del $(TOP)\win32\zlibcompat.def + -del $(TOP)\win32\zlib-ng.def + -del gzread.c diff --git a/3rdparty/zlib-ng/win32/depcheck.cpp b/3rdparty/zlib-ng/win32/depcheck.cpp new file mode 100644 index 0000000000..f83bdd6852 --- /dev/null +++ b/3rdparty/zlib-ng/win32/depcheck.cpp @@ -0,0 +1,321 @@ +/* depcheck.cpp - Dependency checker for NMake Makefiles + * Copyright (c) 2024 Mika T. 
Lindqvist + */ + +#include <cstdio> +#include <fstream> +#include <sstream> +#include <string> +#include <vector> +#include <algorithm> + +int main(int argc, char* argv[]) { + if (argc != 3) { + printf("Usage: depcheck Makefile <top directory>\n"); + return -1; + } + std::filebuf fb; + if (fb.open (argv[1],std::ios::in)) { + std::istream is(&fb); + std::string makefile = argv[1]; + std::string l, tmp, tmp2; + while (is) { + std::getline(is, l); + while (!l.empty() && l.back() == '\\') { + std::getline(is, tmp); + l.replace(l.length() - 1, 1, tmp); + } + size_t pos = l.find("obj:"); + if (pos != std::string::npos) { + std::string objfile = l.substr(0, pos+3); + printf("File: %s\n", objfile.c_str()); + std::vector<std::string> files; + std::stringstream ss(l.substr(pos+4)); + while(getline(ss, tmp, ' ')){ + if (tmp != "" && tmp != "/") { + files.push_back(tmp); + } + } + for (auto it = files.begin(); it != files.end(); ++it) { + printf("Dependency: %s\n", (*it).c_str()); + } + if (!files.empty()) { + std::filebuf fb2; + std::string src = files[0]; + size_t pos2 = src.find("$(TOP)"); + if (pos2 != std::string::npos) { + src.replace(pos2, 6, argv[2]); + } + printf("Source: %s\n", src.c_str()); + if (fb2.open(src.c_str(),std::ios::in)) { + std::istream is2(&fb2); + std::vector<std::string> includes; + while (is2) { + std::getline(is2, l); + pos = l.find("#"); + if (pos != std::string::npos) { + pos2 = l.find("include"); + size_t pos3 = l.find("\""); + if (pos2 != std::string::npos && pos3 != std::string::npos && pos2 > pos && pos3 > pos2) { + tmp = l.substr(pos3 + 1); + pos2 = tmp.find("\""); + if (pos2 != std::string::npos) { + tmp = tmp.substr(0, pos2); + } + pos2 = tmp.find("../"); + if (pos2 != std::string::npos) { + tmp = tmp.substr(3); + } + printf("Line: %s\n", tmp.c_str()); + int found = 0; + for (size_t i = 1; i < files.size(); i++) { + pos3 = files[i].find("$(SUFFIX)"); + if (pos3 != std::string::npos) { + tmp2 = files[i].substr(0, pos3).append(files[i].substr(pos3 + 9)); + printf("Comparing dependency \"%s\" and \"%s\"\n", tmp2.c_str(), tmp.c_str()); + if (tmp2 == tmp) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/%s\"\n", tmp2.c_str(), tmp.c_str()); + if (tmp2 == std::string("$(TOP)/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + + tmp2 = files[i].substr(0, pos3).append("-ng").append(files[i].substr(pos3 + 9)); + printf("Comparing dependency \"%s\" and \"%s\"\n", tmp2.c_str(), tmp.c_str()); + if (tmp2 == tmp) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/%s\"\n", tmp2.c_str(), tmp.c_str()); + if (tmp2 == std::string("$(TOP)/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + } else { + printf("Comparing dependency \"%s\" and \"%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == tmp) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/arch/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/arch/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; +
includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/arch/generic/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/arch/generic/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/arch/arm/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/arch/arm/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/arch/x86/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/arch/x86/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/test/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/test/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + } + } + // Skip irrelevant dependencies + if (tmp.substr(0, 9) == "arch/s390") found = 1; + if (tmp == "zlib-ng.h" && std::find(includes.begin(), includes.end(), "zlib.h") != includes.end()) found = 1; + if (found == 0) { + printf("%s: Dependency %s missing for %s!\n", makefile.c_str(), tmp.c_str(), objfile.c_str()); + return -1; + } + } + } + } + for (size_t i = 1; i < files.size(); i++) { + int found = 0; + tmp = files[i]; + printf("Dependency: %s\n", tmp.c_str()); + pos2 = tmp.find("$(TOP)"); + if (pos2 != std::string::npos) { + tmp = tmp.substr(7); + } + for (size_t j = 0; j < includes.size(); j++) { + pos2 = tmp.find("$(SUFFIX)"); + if (pos2 != std::string::npos) { + std::string tmp1 = tmp.substr(0, pos2).append(tmp.substr(pos2 + 9)); + printf("[%zd/%zd] Comparing dependency \"%s\" and \"%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == includes[j]) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/generic/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/generic/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/arm/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/arm/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/x86/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/x86/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"test/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("test/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + tmp1 = tmp.substr(0, pos2).append("-ng").append(tmp.substr(pos2 + 9)); + printf("[%zd/%zd] Comparing dependency \"%s\" and \"%s\"\n", j, includes.size(), 
tmp1.c_str(), includes[j].c_str()); + if (tmp1 == includes[j]) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/generic/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/generic/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/arm/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/arm/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/x86/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/x86/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"test/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("test/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + } else { + printf("[%zd/%zd] Comparing dependency \"%s\" and \"%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == includes[j]) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("arch/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/generic/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("arch/generic/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/arm/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("arch/arm/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/x86/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("arch/x86/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"test/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("test/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + } + } + // Skip indirect dependencies + if (tmp.find("arm_features.h") != std::string::npos + && std::find(includes.begin(), includes.end(), "cpu_features.h") != includes.end() + && (makefile.find(".arm") != std::string::npos + || makefile.find(".a64") != std::string::npos)) found = 1; + if (tmp.find("x86_features.h") != std::string::npos + && std::find(includes.begin(), includes.end(), "cpu_features.h") != includes.end() + && makefile.find(".msc") != std::string::npos) found = 1; + // + if (tmp.find("generic_functions.h") != 
std::string::npos + && std::find(includes.begin(), includes.end(), "arch_functions.h") != includes.end()) found = 1; + if (tmp.find("arm_functions.h") != std::string::npos + && std::find(includes.begin(), includes.end(), "arch_functions.h") != includes.end() + && (makefile.find(".arm") != std::string::npos + || makefile.find(".a64") != std::string::npos)) found = 1; + if (tmp.find("x86_functions.h") != std::string::npos + && std::find(includes.begin(), includes.end(), "arch_functions.h") != includes.end() + && makefile.find(".msc") != std::string::npos) found = 1; + if (found == 0) { + printf("%s: Dependency %s not needed for %s\n", makefile.c_str(), files[i].c_str(), objfile.c_str()); + return -1; + } + } + fb2.close(); + } + } + } + } + fb.close(); + } + return 0; +} diff --git a/3rdparty/zlib-ng/win32/replace.vbs b/3rdparty/zlib-ng/win32/replace.vbs new file mode 100644 index 0000000000..6779971d07 --- /dev/null +++ b/3rdparty/zlib-ng/win32/replace.vbs @@ -0,0 +1,15 @@ +strInputFileName = Wscript.Arguments(0) +strOutputFileName = Wscript.Arguments(1) +strOldText = Wscript.Arguments(2) +strNewText = Wscript.Arguments(3) + +Set objFSO = CreateObject("Scripting.FileSystemObject") +Set objFile = objFSO.OpenTextFile(strInputFileName, 1) + +strText = objFile.ReadAll +objFile.Close +strNewText = Replace(strText, strOldText, strNewText) + +Set objFile = objFSO.OpenTextFile(strOutputFileName, 2, True) +objFile.Write strNewText +objFile.Close diff --git a/3rdparty/zlib-ng/win32/zlib-ng.def.in b/3rdparty/zlib-ng/win32/zlib-ng.def.in new file mode 100644 index 0000000000..53b2bc21f7 --- /dev/null +++ b/3rdparty/zlib-ng/win32/zlib-ng.def.in @@ -0,0 +1,60 @@ +; zlib-ng data compression library +EXPORTS +; basic functions + @ZLIB_SYMBOL_PREFIX@zlibng_version + @ZLIB_SYMBOL_PREFIX@zng_deflate + @ZLIB_SYMBOL_PREFIX@zng_deflateEnd + @ZLIB_SYMBOL_PREFIX@zng_deflateInit + @ZLIB_SYMBOL_PREFIX@zng_deflateInit2 + @ZLIB_SYMBOL_PREFIX@zng_inflate + @ZLIB_SYMBOL_PREFIX@zng_inflateEnd + @ZLIB_SYMBOL_PREFIX@zng_inflateInit + @ZLIB_SYMBOL_PREFIX@zng_inflateInit2 + @ZLIB_SYMBOL_PREFIX@zng_inflateBackInit +; advanced functions + @ZLIB_SYMBOL_PREFIX@zng_deflateSetDictionary + @ZLIB_SYMBOL_PREFIX@zng_deflateGetDictionary + @ZLIB_SYMBOL_PREFIX@zng_deflateCopy + @ZLIB_SYMBOL_PREFIX@zng_deflateReset + @ZLIB_SYMBOL_PREFIX@zng_deflateParams + @ZLIB_SYMBOL_PREFIX@zng_deflateTune + @ZLIB_SYMBOL_PREFIX@zng_deflateBound + @ZLIB_SYMBOL_PREFIX@zng_deflatePending + @ZLIB_SYMBOL_PREFIX@zng_deflatePrime + @ZLIB_SYMBOL_PREFIX@zng_deflateSetHeader + @ZLIB_SYMBOL_PREFIX@zng_deflateSetParams + @ZLIB_SYMBOL_PREFIX@zng_deflateGetParams + @ZLIB_SYMBOL_PREFIX@zng_inflateSetDictionary + @ZLIB_SYMBOL_PREFIX@zng_inflateGetDictionary + @ZLIB_SYMBOL_PREFIX@zng_inflateSync + @ZLIB_SYMBOL_PREFIX@zng_inflateCopy + @ZLIB_SYMBOL_PREFIX@zng_inflateReset + @ZLIB_SYMBOL_PREFIX@zng_inflateReset2 + @ZLIB_SYMBOL_PREFIX@zng_inflatePrime + @ZLIB_SYMBOL_PREFIX@zng_inflateMark + @ZLIB_SYMBOL_PREFIX@zng_inflateGetHeader + @ZLIB_SYMBOL_PREFIX@zng_inflateBack + @ZLIB_SYMBOL_PREFIX@zng_inflateBackEnd + @ZLIB_SYMBOL_PREFIX@zng_zlibCompileFlags +; utility functions + @ZLIB_SYMBOL_PREFIX@zng_compress + @ZLIB_SYMBOL_PREFIX@zng_compress2 + @ZLIB_SYMBOL_PREFIX@zng_compressBound + @ZLIB_SYMBOL_PREFIX@zng_uncompress + @ZLIB_SYMBOL_PREFIX@zng_uncompress2 +; checksum functions + @ZLIB_SYMBOL_PREFIX@zng_adler32 + @ZLIB_SYMBOL_PREFIX@zng_adler32_z + @ZLIB_SYMBOL_PREFIX@zng_crc32 + @ZLIB_SYMBOL_PREFIX@zng_crc32_z + @ZLIB_SYMBOL_PREFIX@zng_adler32_combine + 
@ZLIB_SYMBOL_PREFIX@zng_crc32_combine +; various hacks, don't look :) + @ZLIB_SYMBOL_PREFIX@zng_zError + @ZLIB_SYMBOL_PREFIX@zng_inflateSyncPoint + @ZLIB_SYMBOL_PREFIX@zng_get_crc_table + @ZLIB_SYMBOL_PREFIX@zng_inflateUndermine + @ZLIB_SYMBOL_PREFIX@zng_inflateValidate + @ZLIB_SYMBOL_PREFIX@zng_inflateCodesUsed + @ZLIB_SYMBOL_PREFIX@zng_inflateResetKeep + @ZLIB_SYMBOL_PREFIX@zng_deflateResetKeep diff --git a/3rdparty/zlib-ng/win32/zlib-ng1.rc b/3rdparty/zlib-ng/win32/zlib-ng1.rc new file mode 100644 index 0000000000..f65cfa254e --- /dev/null +++ b/3rdparty/zlib-ng/win32/zlib-ng1.rc @@ -0,0 +1,36 @@ +#include <winver.h> +#include "zlib-ng.h" + +VS_VERSION_INFO VERSIONINFO + FILEVERSION ZLIBNG_VER_MAJOR,ZLIBNG_VER_MINOR,ZLIBNG_VER_REVISION,0 + PRODUCTVERSION ZLIBNG_VER_MAJOR,ZLIBNG_VER_MINOR,ZLIBNG_VER_REVISION,0 + FILEFLAGSMASK VS_FFI_FILEFLAGSMASK +#ifdef _DEBUG + FILEFLAGS 1 +#else + FILEFLAGS 0 +#endif + FILEOS VOS__WINDOWS32 + FILETYPE VFT_DLL + FILESUBTYPE 0 // not used +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904E4" + //language ID = U.S. English, char set = Windows, Multilingual + BEGIN + VALUE "FileDescription", "zlib data compression library\0" + VALUE "FileVersion", ZLIBNG_VERSION "\0" + VALUE "InternalName", "zlib-ng1.dll\0" + VALUE "LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0" + VALUE "OriginalFilename", "zlib-ng1.dll\0" + VALUE "ProductName", "zlib\0" + VALUE "ProductVersion", ZLIBNG_VERSION "\0" + VALUE "Comments", "For more information visit https://www.zlib.net/\0" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x0409, 1252 + END +END diff --git a/3rdparty/zlib-ng/win32/zlib.def.in b/3rdparty/zlib-ng/win32/zlib.def.in new file mode 100644 index 0000000000..561a42f7f8 --- /dev/null +++ b/3rdparty/zlib-ng/win32/zlib.def.in @@ -0,0 +1,64 @@ +; zlib data compression library +EXPORTS +; basic functions + @ZLIB_SYMBOL_PREFIX@zlibVersion + @ZLIB_SYMBOL_PREFIX@deflate + @ZLIB_SYMBOL_PREFIX@deflateEnd + @ZLIB_SYMBOL_PREFIX@inflate + @ZLIB_SYMBOL_PREFIX@inflateEnd +; advanced functions + @ZLIB_SYMBOL_PREFIX@deflateSetDictionary + @ZLIB_SYMBOL_PREFIX@deflateGetDictionary + @ZLIB_SYMBOL_PREFIX@deflateCopy + @ZLIB_SYMBOL_PREFIX@deflateReset + @ZLIB_SYMBOL_PREFIX@deflateParams + @ZLIB_SYMBOL_PREFIX@deflateTune + @ZLIB_SYMBOL_PREFIX@deflateBound + @ZLIB_SYMBOL_PREFIX@deflatePending + @ZLIB_SYMBOL_PREFIX@deflatePrime + @ZLIB_SYMBOL_PREFIX@deflateSetHeader + @ZLIB_SYMBOL_PREFIX@inflateSetDictionary + @ZLIB_SYMBOL_PREFIX@inflateGetDictionary + @ZLIB_SYMBOL_PREFIX@inflateSync + @ZLIB_SYMBOL_PREFIX@inflateCopy + @ZLIB_SYMBOL_PREFIX@inflateReset + @ZLIB_SYMBOL_PREFIX@inflateReset2 + @ZLIB_SYMBOL_PREFIX@inflatePrime + @ZLIB_SYMBOL_PREFIX@inflateMark + @ZLIB_SYMBOL_PREFIX@inflateGetHeader + @ZLIB_SYMBOL_PREFIX@inflateBack + @ZLIB_SYMBOL_PREFIX@inflateBackEnd + @ZLIB_SYMBOL_PREFIX@zlibCompileFlags +; utility functions + @ZLIB_SYMBOL_PREFIX@compress + @ZLIB_SYMBOL_PREFIX@compress2 + @ZLIB_SYMBOL_PREFIX@compressBound + @ZLIB_SYMBOL_PREFIX@uncompress + @ZLIB_SYMBOL_PREFIX@uncompress2 +; large file functions + @ZLIB_SYMBOL_PREFIX@adler32_combine64 + @ZLIB_SYMBOL_PREFIX@crc32_combine64 + @ZLIB_SYMBOL_PREFIX@crc32_combine_gen64 +; checksum functions + @ZLIB_SYMBOL_PREFIX@adler32 + @ZLIB_SYMBOL_PREFIX@adler32_z + @ZLIB_SYMBOL_PREFIX@crc32 + @ZLIB_SYMBOL_PREFIX@crc32_z + @ZLIB_SYMBOL_PREFIX@adler32_combine + @ZLIB_SYMBOL_PREFIX@crc32_combine + @ZLIB_SYMBOL_PREFIX@crc32_combine_gen + @ZLIB_SYMBOL_PREFIX@crc32_combine_op +; various hacks, don't look :) +
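The `@ZLIB_SYMBOL_PREFIX@` placeholders in these `.def.in` templates are expanded by `win32/replace.vbs`; on the C side the same prefix is applied through the generated `zlib_name_mangling` header. Roughly like this (illustrative macros and prefix, not the shipped header):

```c
/* Illustrative only: how a generated zlib_name_mangling.h could apply
 * SYMBOL_PREFIX=foo_ to the public names listed in the .def files. */
#define ZLIB_SYMBOL_PREFIX foo_
#define Z_MANGLE2(p, n) p##n
#define Z_MANGLE(p, n) Z_MANGLE2(p, n)

#define deflate    Z_MANGLE(ZLIB_SYMBOL_PREFIX, deflate)
#define deflateEnd Z_MANGLE(ZLIB_SYMBOL_PREFIX, deflateEnd)
#define inflate    Z_MANGLE(ZLIB_SYMBOL_PREFIX, inflate)
/* ...one #define per exported symbol... */
```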
+ @ZLIB_SYMBOL_PREFIX@deflateInit_
+ @ZLIB_SYMBOL_PREFIX@deflateInit2_
+ @ZLIB_SYMBOL_PREFIX@inflateInit_
+ @ZLIB_SYMBOL_PREFIX@inflateInit2_
+ @ZLIB_SYMBOL_PREFIX@inflateBackInit_
+ @ZLIB_SYMBOL_PREFIX@zError
+ @ZLIB_SYMBOL_PREFIX@inflateSyncPoint
+ @ZLIB_SYMBOL_PREFIX@get_crc_table
+ @ZLIB_SYMBOL_PREFIX@inflateUndermine
+ @ZLIB_SYMBOL_PREFIX@inflateValidate
+ @ZLIB_SYMBOL_PREFIX@inflateCodesUsed
+ @ZLIB_SYMBOL_PREFIX@inflateResetKeep
+ @ZLIB_SYMBOL_PREFIX@deflateResetKeep
diff --git a/3rdparty/zlib-ng/win32/zlib1.rc b/3rdparty/zlib-ng/win32/zlib1.rc
new file mode 100644
index 0000000000..9bb9c18654
--- /dev/null
+++ b/3rdparty/zlib-ng/win32/zlib1.rc
@@ -0,0 +1,36 @@
+#include <winver.h>
+#include "zlib.h"
+
+VS_VERSION_INFO VERSIONINFO
+ FILEVERSION ZLIB_VER_MAJOR,ZLIB_VER_MINOR,ZLIB_VER_REVISION,0
+ PRODUCTVERSION ZLIB_VER_MAJOR,ZLIB_VER_MINOR,ZLIB_VER_REVISION,0
+ FILEFLAGSMASK VS_FFI_FILEFLAGSMASK
+#ifdef _DEBUG
+ FILEFLAGS 1
+#else
+ FILEFLAGS 0
+#endif
+ FILEOS VOS__WINDOWS32
+ FILETYPE VFT_DLL
+ FILESUBTYPE 0 // not used
+BEGIN
+ BLOCK "StringFileInfo"
+ BEGIN
+ BLOCK "040904E4"
+ //language ID = U.S. English, char set = Windows, Multilingual
+ BEGIN
+ VALUE "FileDescription", "zlib data compression library\0"
+ VALUE "FileVersion", ZLIB_VERSION "\0"
+ VALUE "InternalName", "zlib1.dll\0"
+ VALUE "LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0"
+ VALUE "OriginalFilename", "zlib1.dll\0"
+ VALUE "ProductName", "zlib\0"
+ VALUE "ProductVersion", ZLIB_VERSION "\0"
+ VALUE "Comments", "For more information visit https://www.zlib.net/\0"
+ END
+ END
+ BLOCK "VarFileInfo"
+ BEGIN
+ VALUE "Translation", 0x0409, 1252
+ END
+END
diff --git a/3rdparty/zlib-ng/win32/zlibcompat.def.in b/3rdparty/zlib-ng/win32/zlibcompat.def.in
new file mode 100644
index 0000000000..52a713cf03
--- /dev/null
+++ b/3rdparty/zlib-ng/win32/zlibcompat.def.in
@@ -0,0 +1,97 @@
+; zlib data compression library
+EXPORTS
+; basic functions
+ @ZLIB_SYMBOL_PREFIX@zlibVersion
+ @ZLIB_SYMBOL_PREFIX@deflate
+ @ZLIB_SYMBOL_PREFIX@deflateEnd
+ @ZLIB_SYMBOL_PREFIX@inflate
+ @ZLIB_SYMBOL_PREFIX@inflateEnd
+; advanced functions
+ @ZLIB_SYMBOL_PREFIX@deflateSetDictionary
+ @ZLIB_SYMBOL_PREFIX@deflateGetDictionary
+ @ZLIB_SYMBOL_PREFIX@deflateCopy
+ @ZLIB_SYMBOL_PREFIX@deflateReset
+ @ZLIB_SYMBOL_PREFIX@deflateParams
+ @ZLIB_SYMBOL_PREFIX@deflateTune
+ @ZLIB_SYMBOL_PREFIX@deflateBound
+ @ZLIB_SYMBOL_PREFIX@deflatePending
+ @ZLIB_SYMBOL_PREFIX@deflatePrime
+ @ZLIB_SYMBOL_PREFIX@deflateSetHeader
+ @ZLIB_SYMBOL_PREFIX@inflateSetDictionary
+ @ZLIB_SYMBOL_PREFIX@inflateGetDictionary
+ @ZLIB_SYMBOL_PREFIX@inflateSync
+ @ZLIB_SYMBOL_PREFIX@inflateCopy
+ @ZLIB_SYMBOL_PREFIX@inflateReset
+ @ZLIB_SYMBOL_PREFIX@inflateReset2
+ @ZLIB_SYMBOL_PREFIX@inflatePrime
+ @ZLIB_SYMBOL_PREFIX@inflateMark
+ @ZLIB_SYMBOL_PREFIX@inflateGetHeader
+ @ZLIB_SYMBOL_PREFIX@inflateBack
+ @ZLIB_SYMBOL_PREFIX@inflateBackEnd
+ @ZLIB_SYMBOL_PREFIX@zlibCompileFlags
+; utility functions
+ @ZLIB_SYMBOL_PREFIX@compress
+ @ZLIB_SYMBOL_PREFIX@compress2
+ @ZLIB_SYMBOL_PREFIX@compressBound
+ @ZLIB_SYMBOL_PREFIX@uncompress
+ @ZLIB_SYMBOL_PREFIX@uncompress2
+ @ZLIB_SYMBOL_PREFIX@gzopen
+ @ZLIB_SYMBOL_PREFIX@gzdopen
+ @ZLIB_SYMBOL_PREFIX@gzbuffer
+ @ZLIB_SYMBOL_PREFIX@gzsetparams
+ @ZLIB_SYMBOL_PREFIX@gzread
+ @ZLIB_SYMBOL_PREFIX@gzfread
+ @ZLIB_SYMBOL_PREFIX@gzwrite
+ @ZLIB_SYMBOL_PREFIX@gzfwrite
+ @ZLIB_SYMBOL_PREFIX@gzprintf
+ @ZLIB_SYMBOL_PREFIX@gzvprintf
+ @ZLIB_SYMBOL_PREFIX@gzputs
+ @ZLIB_SYMBOL_PREFIX@gzgets
+ @ZLIB_SYMBOL_PREFIX@gzputc
+ @ZLIB_SYMBOL_PREFIX@gzgetc
+ @ZLIB_SYMBOL_PREFIX@gzungetc
+ @ZLIB_SYMBOL_PREFIX@gzflush
+ @ZLIB_SYMBOL_PREFIX@gzseek
+ @ZLIB_SYMBOL_PREFIX@gzrewind
+ @ZLIB_SYMBOL_PREFIX@gztell
+ @ZLIB_SYMBOL_PREFIX@gzoffset
+ @ZLIB_SYMBOL_PREFIX@gzeof
+ @ZLIB_SYMBOL_PREFIX@gzdirect
+ @ZLIB_SYMBOL_PREFIX@gzclose
+ @ZLIB_SYMBOL_PREFIX@gzclose_r
+ @ZLIB_SYMBOL_PREFIX@gzclose_w
+ @ZLIB_SYMBOL_PREFIX@gzerror
+ @ZLIB_SYMBOL_PREFIX@gzclearerr
+; large file functions
+ @ZLIB_SYMBOL_PREFIX@gzopen64
+ @ZLIB_SYMBOL_PREFIX@gzseek64
+ @ZLIB_SYMBOL_PREFIX@gztell64
+ @ZLIB_SYMBOL_PREFIX@gzoffset64
+ @ZLIB_SYMBOL_PREFIX@adler32_combine64
+ @ZLIB_SYMBOL_PREFIX@crc32_combine64
+ @ZLIB_SYMBOL_PREFIX@crc32_combine_gen64
+; checksum functions
+ @ZLIB_SYMBOL_PREFIX@adler32
+ @ZLIB_SYMBOL_PREFIX@adler32_z
+ @ZLIB_SYMBOL_PREFIX@crc32
+ @ZLIB_SYMBOL_PREFIX@crc32_z
+ @ZLIB_SYMBOL_PREFIX@adler32_combine
+ @ZLIB_SYMBOL_PREFIX@crc32_combine
+ @ZLIB_SYMBOL_PREFIX@crc32_combine_gen
+ @ZLIB_SYMBOL_PREFIX@crc32_combine_op
+; various hacks, don't look :)
+ @ZLIB_SYMBOL_PREFIX@deflateInit_
+ @ZLIB_SYMBOL_PREFIX@deflateInit2_
+ @ZLIB_SYMBOL_PREFIX@inflateInit_
+ @ZLIB_SYMBOL_PREFIX@inflateInit2_
+ @ZLIB_SYMBOL_PREFIX@inflateBackInit_
+ @ZLIB_SYMBOL_PREFIX@gzgetc_
+ @ZLIB_SYMBOL_PREFIX@zError
+ @ZLIB_SYMBOL_PREFIX@inflateSyncPoint
+ @ZLIB_SYMBOL_PREFIX@get_crc_table
+ @ZLIB_SYMBOL_PREFIX@inflateUndermine
+ @ZLIB_SYMBOL_PREFIX@inflateValidate
+ @ZLIB_SYMBOL_PREFIX@inflateCodesUsed
+ @ZLIB_SYMBOL_PREFIX@inflateResetKeep
+ @ZLIB_SYMBOL_PREFIX@deflateResetKeep
+ @ZLIB_SYMBOL_PREFIX@gzopen_w
diff --git a/3rdparty/zlib-ng/zbuild.h b/3rdparty/zlib-ng/zbuild.h
index d550b4c582..9157eef9e3
--- a/3rdparty/zlib-ng/zbuild.h
+++ b/3rdparty/zlib-ng/zbuild.h
@@ -202,6 +202,24 @@
 # define ALIGNED_(x) __declspec(align(x))
 #endif
+#ifdef HAVE_BUILTIN_ASSUME_ALIGNED
+# define HINT_ALIGNED(p,n) __builtin_assume_aligned((void *)(p),(n))
+#else
+# define HINT_ALIGNED(p,n) (p)
+#endif
+#define HINT_ALIGNED_16(p) HINT_ALIGNED((p),16)
+#define HINT_ALIGNED_64(p) HINT_ALIGNED((p),64)
+#define HINT_ALIGNED_4096(p) HINT_ALIGNED((p),4096)
+
+/* PADSZ returns needed bytes to pad bpos to pad size
+ * PAD_NN calculates pad size and adds it to bpos, returning the result.
+ * All take an integer or a pointer as bpos input.
+ */
+#define PADSZ(bpos, pad) (((pad) - ((uintptr_t)(bpos) % (pad))) % (pad))
+#define PAD_16(bpos) ((bpos) + PADSZ((bpos),16))
+#define PAD_64(bpos) ((bpos) + PADSZ((bpos),64))
+#define PAD_4096(bpos) ((bpos) + PADSZ((bpos),4096))
+
 /* Diagnostic functions */
 #ifdef ZLIB_DEBUG
 #  include <stdio.h>
@@ -246,6 +264,31 @@
 # endif
 #endif
+#if defined(__has_feature)
+# if __has_feature(address_sanitizer)
+# define Z_ADDRESS_SANITIZER 1
+# endif
+#elif defined(__SANITIZE_ADDRESS__)
+# define Z_ADDRESS_SANITIZER 1
+#endif
+
+/*
+ * __asan_loadN() and __asan_storeN() calls are inserted by compilers in order to check memory accesses.
+ * They can be called manually too, with the following caveats:
+ *   gcc says: "warning: implicit declaration of function ‘...’"
+ *   g++ says: "error: new declaration ‘...’ ambiguates built-in declaration ‘...’"
+ * Accommodate both.
+ */ +#ifdef Z_ADDRESS_SANITIZER +#ifndef __cplusplus +void __asan_loadN(void *, long); +void __asan_storeN(void *, long); +#endif +#else +# define __asan_loadN(a, size) do { Z_UNUSED(a); Z_UNUSED(size); } while (0) +# define __asan_storeN(a, size) do { Z_UNUSED(a); Z_UNUSED(size); } while (0) +#endif + #if defined(__has_feature) # if __has_feature(memory_sanitizer) # define Z_MEMORY_SANITIZER 1 @@ -254,7 +297,31 @@ #endif #ifndef Z_MEMORY_SANITIZER +# define __msan_check_mem_is_initialized(a, size) do { Z_UNUSED(a); Z_UNUSED(size); } while (0) # define __msan_unpoison(a, size) do { Z_UNUSED(a); Z_UNUSED(size); } while (0) #endif +/* Notify sanitizer runtime about an upcoming read access. */ +#define instrument_read(a, size) do { \ + void *__a = (void *)(a); \ + long __size = size; \ + __asan_loadN(__a, __size); \ + __msan_check_mem_is_initialized(__a, __size); \ +} while (0) + +/* Notify sanitizer runtime about an upcoming write access. */ +#define instrument_write(a, size) do { \ + void *__a = (void *)(a); \ + long __size = size; \ + __asan_storeN(__a, __size); \ +} while (0) + +/* Notify sanitizer runtime about an upcoming read/write access. */ +#define instrument_read_write(a, size) do { \ + void *__a = (void *)(a); \ + long __size = size; \ + __asan_storeN(__a, __size); \ + __msan_check_mem_is_initialized(__a, __size); \ +} while (0) + #endif diff --git a/3rdparty/zlib-ng/zconf-ng.h.in b/3rdparty/zlib-ng/zconf-ng.h.in new file mode 100644 index 0000000000..a1b5311b85 --- /dev/null +++ b/3rdparty/zlib-ng/zconf-ng.h.in @@ -0,0 +1,176 @@ +/* zconf-ng.h -- configuration of the zlib-ng compression library + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ZCONFNG_H +#define ZCONFNG_H + +#include "zlib_name_mangling-ng.h" + +#if !defined(_WIN32) && defined(__WIN32__) +# define _WIN32 +#endif + +/* Clang macro for detecting declspec support + * https://clang.llvm.org/docs/LanguageExtensions.html#has-declspec-attribute + */ +#ifndef __has_declspec_attribute +# define __has_declspec_attribute(x) 0 +#endif + +/* Always define z_const as const */ +#define z_const const + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# define MAX_MEM_LEVEL 9 +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2. + * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files + * created by gzip. (Files created by minigzip can still be extracted by + * gzip.) + */ +#ifndef MIN_WBITS +# define MIN_WBITS 8 /* 256 LZ77 window */ +#endif +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + (1 << (windowBits+2)) + (1 << (memLevel+9)) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). + + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus about 7 kilobytes + for small objects. +*/ + +/* Type declarations */ + +#ifdef ZLIB_INTERNAL +# define Z_INTERNAL ZLIB_INTERNAL +#endif + +/* If building or using zlib as a DLL, define ZLIB_DLL. + * This is not mandatory, but it offers a little performance increase. 
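The zbuild.h hunk above adds two small utility groups: the PADSZ/PAD_NN macros round a buffer position up to an alignment boundary, and the instrument_read/instrument_write/instrument_read_write macros are meant to sit immediately before hand-written wide loads and stores so ASan/MSan can check the whole accessed range even when the access bypasses compiler instrumentation. A minimal, self-contained sketch of the padding arithmetic follows; the macro definitions are copied from the hunk, while the example values are ours, purely illustrative:

```cpp
#include <stdint.h>

// Copied from the zbuild.h hunk above so the sketch stands alone.
#define PADSZ(bpos, pad) (((pad) - ((uintptr_t)(bpos) % (pad))) % (pad))
#define PAD_64(bpos) ((bpos) + PADSZ((bpos),64))

int main() {
    uintptr_t bpos = 100;
    // 100 % 64 == 36, so 28 bytes of padding are needed: PAD_64(100) == 128.
    // A position that is already aligned stays put: PAD_64(128) == 128.
    return (PAD_64(bpos) == 128 && PAD_64((uintptr_t)128) == 128) ? 0 : 1;
}
```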
+ */
+#if defined(ZLIB_DLL) && (defined(_WIN32) || (__has_declspec_attribute(dllexport) && __has_declspec_attribute(dllimport)))
+# ifdef Z_INTERNAL
+# define Z_EXTERN extern __declspec(dllexport)
+# else
+# define Z_EXTERN extern __declspec(dllimport)
+# endif
+#endif
+
+/* If building or using zlib with the WINAPI/WINAPIV calling convention,
+ * define ZLIB_WINAPI.
+ * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
+ */
+#if defined(ZLIB_WINAPI) && defined(_WIN32)
+# include <windows.h>
+ /* No need for _export, use ZLIB.DEF instead. */
+ /* For complete Windows compatibility, use WINAPI, not __stdcall. */
+# define Z_EXPORT WINAPI
+# define Z_EXPORTVA WINAPIV
+#endif
+
+#ifndef Z_EXTERN
+# define Z_EXTERN extern
+#endif
+#ifndef Z_EXPORT
+# define Z_EXPORT
+#endif
+#ifndef Z_EXPORTVA
+# define Z_EXPORTVA
+#endif
+
+/* Conditional exports */
+#define ZNG_CONDEXPORT Z_EXPORT
+
+/* Fallback for something that includes us. */
+typedef unsigned char Byte;
+typedef Byte Bytef;
+
+typedef unsigned int uInt; /* 16 bits or more */
+typedef unsigned long uLong; /* 32 bits or more */
+
+typedef char charf;
+typedef int intf;
+typedef uInt uIntf;
+typedef uLong uLongf;
+
+typedef void const *voidpc;
+typedef void *voidpf;
+typedef void *voidp;
+
+#ifdef HAVE_UNISTD_H /* may be set to #if 1 by configure/cmake/etc */
+# define Z_HAVE_UNISTD_H
+#endif
+
+#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */
+typedef PTRDIFF_TYPE ptrdiff_t;
+#endif
+
+#include <sys/types.h> /* for off_t */
+
+#include <stddef.h> /* for wchar_t and NULL */
+
+/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and
+ * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even
+ * though the former does not conform to the LFS document), but considering
+ * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as
+ * equivalently requesting no 64-bit operations
+ */
+#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1
+# undef _LARGEFILE64_SOURCE
+#endif
+
+#if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE)
+# include <unistd.h> /* for SEEK_*, off_t, and _LFS64_LARGEFILE */
+# ifndef z_off_t
+# define z_off_t off_t
+# endif
+#endif
+
+#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0
+# define Z_LFS64
+#endif
+
+#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64)
+# define Z_LARGE64
+#endif
+
+#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64)
+# define Z_WANT64
+#endif
+
+#if !defined(SEEK_SET) && defined(WITH_GZFILEOP)
+# define SEEK_SET 0 /* Seek from beginning of file. */
+# define SEEK_CUR 1 /* Seek from current position.
*/ +# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ +#endif + +#ifndef z_off_t +# define z_off_t long +#endif + +#if !defined(_WIN32) && defined(Z_LARGE64) +# define z_off64_t off64_t +#else +# if defined(__MSYS__) +# define z_off64_t _off64_t +# elif defined(_WIN32) && !defined(__GNUC__) +# define z_off64_t __int64 +# else +# define z_off64_t z_off_t +# endif +#endif + +#endif /* ZCONFNG_H */ diff --git a/3rdparty/zlib-ng/zconf.h.in b/3rdparty/zlib-ng/zconf.h.in index 7a6e281e84..be8221fd86 100644 --- a/3rdparty/zlib-ng/zconf.h.in +++ b/3rdparty/zlib-ng/zconf.h.in @@ -1,5 +1,5 @@ /* zconf.h -- configuration of the zlib compression library - * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git a/3rdparty/zlib-ng/zlib.h.in b/3rdparty/zlib-ng/zlib.h.in index eabb94afe0..3dceaa3344 100644 --- a/3rdparty/zlib-ng/zlib.h.in +++ b/3rdparty/zlib-ng/zlib.h.in @@ -1,9 +1,9 @@ #ifndef ZLIB_H_ #define ZLIB_H_ /* zlib.h -- interface of the 'zlib-ng' compression library - Forked from and compatible with zlib 1.2.13 + Forked from and compatible with zlib 1.3.1 - Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler + Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages @@ -49,20 +49,20 @@ extern "C" { #endif -#define ZLIBNG_VERSION "2.1.6" -#define ZLIBNG_VERNUM 0x020106F0L /* MMNNRRSM: major minor revision status modified */ +#define ZLIBNG_VERSION "2.2.1" +#define ZLIBNG_VERNUM 0x020201F0L /* MMNNRRSM: major minor revision status modified */ #define ZLIBNG_VER_MAJOR 2 -#define ZLIBNG_VER_MINOR 1 -#define ZLIBNG_VER_REVISION 6 +#define ZLIBNG_VER_MINOR 2 +#define ZLIBNG_VER_REVISION 1 #define ZLIBNG_VER_STATUS F /* 0=devel, 1-E=beta, F=Release (DEPRECATED) */ #define ZLIBNG_VER_STATUSH 0xF /* Hex values: 0=devel, 1-E=beta, F=Release */ #define ZLIBNG_VER_MODIFIED 0 /* non-zero if modified externally from zlib-ng */ -#define ZLIB_VERSION "1.3.0.zlib-ng" -#define ZLIB_VERNUM 0x130f +#define ZLIB_VERSION "1.3.1.zlib-ng" +#define ZLIB_VERNUM 0x131f #define ZLIB_VER_MAJOR 1 #define ZLIB_VER_MINOR 3 -#define ZLIB_VER_REVISION 0 +#define ZLIB_VER_REVISION 1 #define ZLIB_VER_SUBREVISION 15 /* 15=fork (0xf) */ /* @@ -220,7 +220,7 @@ typedef gz_header *gz_headerp; #define Z_DEFLATED 8 /* The deflate compression method (the only one supported in this version) */ -#define Z_NULL NULL /* for compatibility with zlib, was for initializing zalloc, zfree, opaque */ +#define Z_NULL 0 /* for compatibility with zlib, was for initializing zalloc, zfree, opaque */ #define zlib_version zlibVersion() /* for compatibility with versions < 1.0.2 */ @@ -1732,14 +1732,14 @@ Z_EXTERN unsigned long Z_EXPORT crc32_combine(unsigned long crc1, unsigned long seq1 and seq2 with lengths len1 and len2, CRC-32 check values were calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and - len2. + len2. len2 must be non-negative. */ /* Z_EXTERN unsigned long Z_EXPORT crc32_combine_gen(z_off_t len2); Return the operator corresponding to length len2, to be used with - crc32_combine_op(). + crc32_combine_op(). len2 must be non-negative. 
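The MMNNRRSM layout documented next to ZLIBNG_VERNUM in the zlib.h.in hunk above can be checked with a few shifts. This decode is written for this note only; the field extraction is ours and is not an API offered by zlib-ng:

```cpp
#include <cstdio>

int main() {
    const unsigned long vernum = 0x020201F0UL;        // ZLIBNG_VERNUM for 2.2.1
    const unsigned major    = (vernum >> 24) & 0xFF;  // MM: 2
    const unsigned minor    = (vernum >> 16) & 0xFF;  // NN: 2
    const unsigned revision = (vernum >> 8) & 0xFF;   // RR: 1
    const unsigned status   = (vernum >> 4) & 0xF;    // S: 0xF = release
    const unsigned modified = vernum & 0xF;           // M: 0 = unmodified
    std::printf("%u.%u.%u status=%X modified=%u\n",   // prints "2.2.1 status=F modified=0"
                major, minor, revision, status, modified);
    return 0;
}
```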
*/ Z_EXTERN unsigned long Z_EXPORT crc32_combine_op(unsigned long crc1, unsigned long crc2, diff --git a/3rdparty/zlib-ng/zlib.map b/3rdparty/zlib-ng/zlib.map new file mode 100644 index 0000000000..293e803729 --- /dev/null +++ b/3rdparty/zlib-ng/zlib.map @@ -0,0 +1,98 @@ +ZLIB_1.2.0 { + global: + compressBound; + deflateBound; + inflateBack; + inflateBackEnd; + inflateBackInit_; + inflateCopy; + local: + deflate_copyright; + inflate_copyright; + zcalloc; + zcfree; + z_errmsg; + gz_error; + gz_intmax; + _*; +}; + +ZLIB_1.2.0.2 { + gzclearerr; + gzungetc; + zlibCompileFlags; +} ZLIB_1.2.0; + +ZLIB_1.2.0.8 { + deflatePrime; +} ZLIB_1.2.0.2; + +ZLIB_1.2.2 { + adler32_combine; + crc32_combine; + deflateSetHeader; + inflateGetHeader; +} ZLIB_1.2.0.8; + +ZLIB_1.2.2.3 { + deflateTune; + gzdirect; +} ZLIB_1.2.2; + +ZLIB_1.2.2.4 { + inflatePrime; +} ZLIB_1.2.2.3; + +ZLIB_1.2.3.3 { + adler32_combine64; + crc32_combine64; + gzopen64; + gzseek64; + gztell64; + inflateUndermine; +} ZLIB_1.2.2.4; + +ZLIB_1.2.3.4 { + inflateReset2; + inflateMark; +} ZLIB_1.2.3.3; + +ZLIB_1.2.3.5 { + gzbuffer; + gzoffset; + gzoffset64; + gzclose_r; + gzclose_w; +} ZLIB_1.2.3.4; + +ZLIB_1.2.5.1 { + deflatePending; +} ZLIB_1.2.3.5; + +ZLIB_1.2.5.2 { + deflateResetKeep; + gzgetc_; + inflateResetKeep; +} ZLIB_1.2.5.1; + +ZLIB_1.2.7.1 { + inflateGetDictionary; + gzvprintf; +} ZLIB_1.2.5.2; + +ZLIB_1.2.9 { + inflateCodesUsed; + inflateValidate; + uncompress2; + gzfread; + gzfwrite; + deflateGetDictionary; + adler32_z; + crc32_z; +} ZLIB_1.2.7.1; + +ZLIB_1.2.12 { + crc32_combine_gen; + crc32_combine_gen64; + crc32_combine_op; +} ZLIB_1.2.9; diff --git a/3rdparty/zlib-ng/zutil.c b/3rdparty/zlib-ng/zutil.c index 270a28c742..39fbceb4a0 100644 --- a/3rdparty/zlib-ng/zutil.c +++ b/3rdparty/zlib-ng/zutil.c @@ -21,7 +21,7 @@ z_const char * const PREFIX(z_errmsg)[10] = { }; const char PREFIX3(vstring)[] = - " zlib-ng 2.1.6"; + " zlib-ng 2.2.1"; #ifdef ZLIB_COMPAT const char * Z_EXPORT zlibVersion(void) { @@ -109,51 +109,3 @@ void Z_INTERNAL PREFIX(zcfree)(void *opaque, void *ptr) { Z_UNUSED(opaque); zng_free(ptr); } - -/* Since we support custom memory allocators, some which might not align memory as we expect, - * we have to ask for extra memory and return an aligned pointer. 
*/ -void Z_INTERNAL *PREFIX3(alloc_aligned)(zng_calloc_func zalloc, void *opaque, unsigned items, unsigned size, unsigned align) { - uintptr_t return_ptr, original_ptr; - uint32_t alloc_size, align_diff; - void *ptr; - - /* If no custom calloc function used then call zlib-ng's aligned calloc */ - if (zalloc == PREFIX(zcalloc)) - return PREFIX(zcalloc)(opaque, items, size); - - /* Allocate enough memory for proper alignment and to store the original memory pointer */ - alloc_size = sizeof(void *) + (items * size) + align; - ptr = zalloc(opaque, 1, alloc_size); - if (!ptr) - return NULL; - - /* Calculate return pointer address with space enough to store original pointer */ - align_diff = align - ((uintptr_t)ptr % align); - return_ptr = (uintptr_t)ptr + align_diff; - if (align_diff < sizeof(void *)) - return_ptr += align; - - /* Store the original pointer for free() */ - original_ptr = return_ptr - sizeof(void *); - memcpy((void *)original_ptr, &ptr, sizeof(void *)); - - /* Return properly aligned pointer in allocation */ - return (void *)return_ptr; -} - -void Z_INTERNAL PREFIX3(free_aligned)(zng_cfree_func zfree, void *opaque, void *ptr) { - /* If no custom cfree function used then call zlib-ng's aligned cfree */ - if (zfree == PREFIX(zcfree)) { - PREFIX(zcfree)(opaque, ptr); - return; - } - if (!ptr) - return; - - /* Calculate offset to original memory allocation pointer */ - void *original_ptr = (void *)((uintptr_t)ptr - sizeof(void *)); - void *free_ptr = *(void **)original_ptr; - - /* Free original memory allocation */ - zfree(opaque, free_ptr); -} diff --git a/3rdparty/zlib-ng/zutil.h b/3rdparty/zlib-ng/zutil.h index 663616b44d..a6284502d0 100644 --- a/3rdparty/zlib-ng/zutil.h +++ b/3rdparty/zlib-ng/zutil.h @@ -1,7 +1,7 @@ #ifndef ZUTIL_H_ #define ZUTIL_H_ /* zutil.h -- internal interface and configuration of the compression library - * Copyright (C) 1995-2022 Jean-loup Gailly, Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -24,7 +24,7 @@ typedef unsigned long ulg; extern z_const char * const PREFIX(z_errmsg)[10]; /* indexed by 2-zlib_error */ /* (size given to avoid silly warnings with Visual C++) */ -#define ERR_MSG(err) PREFIX(z_errmsg)[Z_NEED_DICT-(err)] +#define ERR_MSG(err) PREFIX(z_errmsg)[(err) < -6 || (err) > 2 ? 
9 : 2 - (err)] #define ERR_RETURN(strm, err) return (strm->msg = ERR_MSG(err), (err)) /* To be used only when the state is known to be valid */ @@ -103,7 +103,7 @@ extern z_const char * const PREFIX(z_errmsg)[10]; /* indexed by 2-zlib_error */ # define OS_CODE 6 #endif -#if defined(MACOS) || defined(TARGET_OS_MAC) +#if defined(MACOS) # define OS_CODE 7 #endif @@ -137,12 +137,4 @@ void Z_INTERNAL PREFIX(zcfree)(void *opaque, void *ptr); typedef void *zng_calloc_func(void *opaque, unsigned items, unsigned size); typedef void zng_cfree_func(void *opaque, void *ptr); -void Z_INTERNAL *PREFIX3(alloc_aligned)(zng_calloc_func zalloc, void *opaque, unsigned items, unsigned size, unsigned align); -void Z_INTERNAL PREFIX3(free_aligned)(zng_cfree_func zfree, void *opaque, void *ptr); - -#define ZALLOC(strm, items, size) PREFIX3(alloc_aligned)((strm)->zalloc, (strm)->opaque, (items), (size), 64) -#define ZFREE(strm, addr) PREFIX3(free_aligned)((strm)->zfree, (strm)->opaque, (void *)(addr)) - -#define TRY_FREE(s, p) {if (p) ZFREE(s, p);} - #endif /* ZUTIL_H_ */ diff --git a/3rdparty/zlib-ng/zutil_p.h b/3rdparty/zlib-ng/zutil_p.h index caec91d50d..97799f0ce3 100644 --- a/3rdparty/zlib-ng/zutil_p.h +++ b/3rdparty/zlib-ng/zutil_p.h @@ -16,15 +16,19 @@ /* Function to allocate 16 or 64-byte aligned memory */ static inline void *zng_alloc(size_t size) { -#ifdef HAVE_POSIX_MEMALIGN +#ifdef HAVE_ALIGNED_ALLOC + /* Size must be a multiple of alignment */ + size = (size + (64 - 1)) & ~(64 - 1); + return (void *)aligned_alloc(64, size); /* Defined in C11 */ +#elif defined(HAVE_POSIX_MEMALIGN) void *ptr; return posix_memalign(&ptr, 64, size) ? NULL : ptr; #elif defined(_WIN32) return (void *)_aligned_malloc(size, 64); #elif defined(__APPLE__) - return (void *)malloc(size); /* MacOS always aligns to 16 bytes */ -#elif defined(HAVE_ALIGNED_ALLOC) - return (void *)aligned_alloc(64, size); + /* Fallback for when posix_memalign and aligned_alloc are not available. + * On macOS, it always aligns to 16 bytes. 
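The size round-up added to zng_alloc above exists because C11 aligned_alloc requires the requested size to be a multiple of the alignment (the requirement was only relaxed in C17). A short sketch of the same rounding in isolation, with example values chosen by us:

```cpp
#include <cstddef>

int main() {
    // zng_alloc now rounds the request up to a multiple of 64 before
    // calling aligned_alloc(64, size).
    std::size_t size = 100;
    std::size_t rounded = (size + (64 - 1)) & ~(std::size_t)(64 - 1);
    return (rounded == 128) ? 0 : 1;  // 100 -> 128; a multiple of 64 is unchanged
}
```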
*/ + return (void *)malloc(size); #else return (void *)memalign(64, size); #endif diff --git a/CMakeLists.txt b/CMakeLists.txt index 9320c90dac..3bc9cbe038 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,7 +145,7 @@ if(NOT OPENCV_SKIP_CMAKE_SYSTEM_FILE) endif() if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) # https://cmake.org/cmake/help/latest/variable/CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT.html - if(NOT CMAKE_TOOLCHAIN_FILE) + if(NOT CMAKE_CROSSCOMPILING) if(WIN32) set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory" FORCE) else() @@ -508,10 +508,6 @@ OCV_OPTION(OPENCV_ENABLE_MEMORY_SANITIZER "Better support for memory/address san OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CV_GCC ) OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) ) OCV_OPTION(ENABLE_FAST_MATH "Enable compiler options for fast math optimizations on FP computations (not recommended)" OFF) -if(NOT IOS AND (NOT ANDROID OR OPENCV_ANDROID_USE_LEGACY_FLAGS) AND CMAKE_CROSSCOMPILING) # Use CPU_BASELINE instead -OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS OR XROS) ) -OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS OR XROS) ) -endif() OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors" OFF ) OCV_OPTION(ANDROID_EXAMPLES_WITH_LIBS "Build binaries of Android examples with native libraries" OFF IF ANDROID ) diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 1d8e98315e..865bfd28a4 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -104,7 +104,7 @@ ocv_optimization_process_obsolete_option(ENABLE_AVX2 AVX2 ON) ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON) ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF) -ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF) +ocv_optimization_process_obsolete_option(ENABLE_NEON NEON ON) ocv_optimization_process_obsolete_option(ENABLE_VSX VSX ON) @@ -170,7 +170,29 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ") set(CPU_BASELINE_DETECT ON) endif() +# For platforms which don't allow enabling of extra instruction sets with separate compiler options. +# E.g. GCC/Clang for RISC-V/AArch64 use suffixes for -march option. So we should avoid using existing +# CPU features mechanisms and rely on cmake-toolchain files or flags provided via command-line. +macro(ocv_default_baseline_detect_and_check_dispatch) + set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}") + if(NOT CPU_BASELINE MATCHES "^(DETECT|NATIVE|)$") + message(WARNING "CPU_BASELINE is set to '${CPU_BASELINE}', but '${CMAKE_SYSTEM_PROCESSOR}' " + "platform is designed to work with DETECT|NATIVE|, " + "otherwise target CPU architecture may be changed unexpectedly. " + "Please check your resulting compiler flags in the CMake output.") + endif() + foreach(opt ${CPU_DISPATCH}) + if(NOT DEFINED CPU_${opt}_FLAGS_ON) + message(WARNING "${opt} is in the CPU_DISPATCH list, but 'CPU_${opt}_FLAGS_ON' is not set. 
" + "Please provide feature-specific compiler options explicitly.") + endif() + endforeach() +endmacro() + +#=================================================================================================== + if(X86 OR X86_64) + ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL") ocv_update(CPU_AVX512_COMMON_GROUP "AVX_512F;AVX_512CD") @@ -347,7 +369,6 @@ elseif(ARM OR AARCH64) ocv_update(CPU_FP16_IMPLIES "NEON") else() ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16") - ocv_update(CPU_NEON_FLAGS_ON "") ocv_update(CPU_FP16_IMPLIES "NEON") ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON") ocv_update(CPU_NEON_FP16_IMPLIES "NEON") @@ -361,15 +382,19 @@ elseif(ARM OR AARCH64) ocv_update(CPU_NEON_FP16_FLAGS_ON "-march=armv8.2-a+fp16") ocv_update(CPU_NEON_BF16_FLAGS_ON "-march=armv8.2-a+bf16") endif() - set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}") set(CPU_DISPATCH "NEON_FP16;NEON_BF16;NEON_DOTPROD" CACHE STRING "${HELP_CPU_DISPATCH}") + ocv_default_baseline_detect_and_check_dispatch() endif() + elseif(MIPS) + ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp") ocv_update(CPU_KNOWN_OPTIMIZATIONS "MSA") ocv_update(CPU_MSA_FLAGS_ON "-mmsa") set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}") + elseif(PPC64LE) + ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3") ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp") ocv_update(CPU_VSX3_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx3.cpp") @@ -390,9 +415,6 @@ elseif(PPC64LE) set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}") elseif(RISCV) - if(NOT DEFINED PLATFORM_STR) - set(PLATFORM_STR "rv64gc") - endif() ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV;FP16;RVV_ZVFH") ocv_update(CPU_RVV_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv.cpp") @@ -403,13 +425,11 @@ elseif(RISCV) ocv_update(CPU_RVV_FLAGS_ON "-march=rv64gc_v") ocv_update(CPU_FP16_FLAGS_ON "-march=rv64gc_v_zvfhmin") ocv_update(CPU_RVV_ZVFH_FLAGS_ON "-march=rv64gc_v_zvfhmin_zvfh") - ocv_update(CPU_RVV_FLAGS_CONFLICT "-march=[^ ]*") - - set(CPU_DISPATCH "FP16;RVV_ZVFH" CACHE STRING "${HELP_CPU_DISPATCH}") - set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}") + ocv_default_baseline_detect_and_check_dispatch() elseif(LOONGARCH64) + ocv_update(CPU_LSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lsx.cpp") ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp") ocv_update(CPU_KNOWN_OPTIMIZATIONS "LSX;LASX") @@ -451,7 +471,7 @@ macro(ocv_check_compiler_optimization OPT) set(_varname "") if(CPU_${OPT}_TEST_FILE) set(__available 0) - if(__is_from_baseline OR CPU_BASELINE_DETECT) + if(NOT __is_disabled AND (__is_from_baseline OR CPU_BASELINE_DETECT)) set(_varname "HAVE_CPU_${OPT}_SUPPORT") ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" "${_varname}" "${CPU_${OPT}_TEST_FILE}") if(${_varname}) @@ -489,23 +509,6 @@ macro(ocv_check_compiler_optimization OPT) endif() endmacro() -macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRING COMMON_OPTION) - unset(_POSTFIX) - # Check each feature option - foreach(OPT IN LISTS ${FEATURE_NAME_LIST}) - string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND) - if(NOT ${OPT_FOUND} EQUAL -1) - string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}") - string(APPEND _POSTFIX "${TRAILING_PART}") - string(REPLACE " 
${CPU_${OPT}_FLAGS_ON}" "" ${FLAG_STRING} ${${FLAG_STRING}}) - endif() - endforeach() - # If more than one option found, merge them - if(NOT "x${_POSTFIX}" STREQUAL "x") - set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${_POSTFIX}") - endif() -endmacro() - foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "") if("${CPU_${OPT}_FLAGS_ON}" STREQUAL "disabled") @@ -588,7 +591,7 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) if(CPU_${OPT}_SUPPORTED) if(";${CPU_DISPATCH};" MATCHES ";${OPT};" AND NOT __is_from_baseline) list(APPEND CPU_DISPATCH_FINAL ${OPT}) - elseif(__is_from_baseline) + elseif(__is_from_baseline AND NOT __is_disabled) if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};") list(APPEND CPU_BASELINE_FINAL ${OPT}) endif() @@ -599,15 +602,6 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) endif() endforeach() -if(AARCH64) - if(NOT MSVC) - # Define the list of NEON options to check - set(NEON_OPTIONS_LIST NEON_DOTPROD NEON_FP16 NEON_BF16) - set(BASE_ARCHITECTURE "-march=armv8.2-a") - ocv_cpu_aarch64_baseline_merge_feature_options(NEON_OPTIONS_LIST CPU_BASELINE_FLAGS ${BASE_ARCHITECTURE}) - endif() -endif() - foreach(OPT ${CPU_BASELINE_REQUIRE}) if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};") message(SEND_ERROR "Required baseline optimization is not supported: ${OPT} (CPU_BASELINE_REQUIRE=${CPU_BASELINE_REQUIRE})") diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index f23bb13dc5..f0d6378bd7 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -77,6 +77,17 @@ macro(add_env_definitions option) add_definitions("-D${option}=\"${value}\"") endmacro() +# Use same flags for native AArch64 and RISC-V compilation as for cross-compile (Linux) +if(NOT CMAKE_CROSSCOMPILING AND NOT CMAKE_TOOLCHAIN_FILE AND COMMAND ocv_set_platform_flags) + unset(platform_flags) + ocv_set_platform_flags(platform_flags) + # externally-provided flags should have higher priority - prepend our flags + if(platform_flags) + set(CMAKE_CXX_FLAGS "${platform_flags} ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "${platform_flags} ${CMAKE_C_FLAGS}") + endif() +endif() + if(NOT MSVC) # OpenCV fails some tests when 'char' is 'unsigned' by default add_extra_compiler_option(-fsigned-char) diff --git a/cmake/OpenCVMinDepVersions.cmake b/cmake/OpenCVMinDepVersions.cmake index e13bc154d3..0794ec8c71 100644 --- a/cmake/OpenCVMinDepVersions.cmake +++ b/cmake/OpenCVMinDepVersions.cmake @@ -1,5 +1,5 @@ if(NOT DEFINED MIN_VER_CMAKE) - set(MIN_VER_CMAKE 3.5.1) + set(MIN_VER_CMAKE 3.7) endif() set(MIN_VER_CUDA 6.5) set(MIN_VER_CUDNN 7.5) diff --git a/cmake/platforms/OpenCV-Linux.cmake b/cmake/platforms/OpenCV-Linux.cmake index 1bb8bf6d7f..5f015dfb79 100644 --- a/cmake/platforms/OpenCV-Linux.cmake +++ b/cmake/platforms/OpenCV-Linux.cmake @@ -1 +1,9 @@ -# empty +if((CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + AND NOT CMAKE_CROSSCOMPILING + AND NOT CMAKE_TOOLCHAIN_FILE) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") # Maybe use AARCH64 variable? 
+ include(${CMAKE_CURRENT_LIST_DIR}/../../platforms/linux/flags-aarch64.cmake) + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") + include(${CMAKE_CURRENT_LIST_DIR}/../../platforms/linux/flags-riscv64.cmake) + endif() +endif() diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index b58e5b2e50..e00ef365ed 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -811,10 +811,17 @@ The function cv::minMaxLoc finds the minimum and maximum element values and thei extremums are searched across the whole array or, if mask is not an empty array, in the specified array region. -The function do not work with multi-channel arrays. If you need to find minimum or maximum -elements across all the channels, use Mat::reshape first to reinterpret the array as -single-channel. Or you may extract the particular channel using either extractImageCOI, or -mixChannels, or split. +In C++, if the input is multi-channel, you should omit the minLoc, maxLoc, and mask arguments +(i.e. leave them as NULL, NULL, and noArray() respectively). These arguments are not +supported for multi-channel input arrays. If working with multi-channel input and you +need the minLoc, maxLoc, or mask arguments, then use Mat::reshape first to reinterpret +the array as single-channel. Alternatively, you can extract the particular channel using either +extractImageCOI, mixChannels, or split. + +In Python, multi-channel input is not supported at all due to a limitation in the +binding generation process (there is no way to set minLoc and maxLoc to NULL). A +workaround is to operate on each channel individually or to use NumPy to achieve the same +functionality. @note CV_16F/CV_16BF/CV_Bool/CV_64U/CV_64S/CV_32U are not supported for src. @param src input single-channel array. @param minVal pointer to the returned minimum value; NULL is used if not required. diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 67aba0bf27..6e033bf67c 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -775,317 +775,15 @@ namespace CV__SIMD_NAMESPACE { /** @brief SIMD processing state cleanup call */ inline void vx_cleanup() { VXPREFIX(_cleanup)(); } -#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP)) +#if !CV_SIMD_SCALABLE // Compatibility layer - +#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP)) template struct VTraits { static inline int vlanes() { return T::nlanes; } enum { nlanes = T::nlanes, max_nlanes = T::nlanes }; using lane_type = typename T::lane_type; }; - #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ - inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a + b; \ - } \ - inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a - b; \ - } \ - template \ - inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... 
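To make the revised minMaxLoc documentation above concrete, here is a short C++ usage sketch matching the described behavior; the matrix contents are arbitrary, and this is an illustration rather than a test from this patch:

```cpp
#include <opencv2/core.hpp>

int main() {
    cv::Mat img(4, 4, CV_8UC3, cv::Scalar(1, 2, 3));
    double minV, maxV;

    // Multi-channel input: leave minLoc/maxLoc/mask unset (C++ only).
    cv::minMaxLoc(img, &minV, &maxV);

    // If locations are needed, reinterpret the data as single-channel first.
    cv::Point minLoc, maxLoc;
    cv::minMaxLoc(img.reshape(1), &minV, &maxV, &minLoc, &maxLoc);
    return 0;
}
```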
vf) { \ - return v_add(f1 + f2, vf...); \ - } - #define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \ - inline _Tpvec v_shr(const _Tpvec& a, int n) \ - { \ - return a >> n; \ - } \ - inline _Tpvec v_shl(const _Tpvec& a, int n) \ - { \ - return a << n; \ - } - - OPENCV_HAL_WRAP_SHIFT_OP(v_uint16) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint32) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint64) - OPENCV_HAL_WRAP_SHIFT_OP(v_int16) - OPENCV_HAL_WRAP_SHIFT_OP(v_int32) - OPENCV_HAL_WRAP_SHIFT_OP(v_int64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - // when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2) - OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - // when we use CV_SIMD256 with 512 bit SIMD (e.g. 
AVX512) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16) - OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4) - #endif - #endif - - #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \ - inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a & b; \ - } \ - inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a | b; \ - } \ - inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a ^ b; \ - } - - #define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \ - inline _Tpvec v_not(const _Tpvec& a) \ - { \ - return ~a; \ - } - - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32) - OPENCV_HAL_WRAP_NOT_OP(v_uint8) - OPENCV_HAL_WRAP_NOT_OP(v_uint16) - OPENCV_HAL_WRAP_NOT_OP(v_uint32) - OPENCV_HAL_WRAP_NOT_OP(v_uint64) - OPENCV_HAL_WRAP_NOT_OP(v_int8) - OPENCV_HAL_WRAP_NOT_OP(v_int16) - OPENCV_HAL_WRAP_NOT_OP(v_int32) - OPENCV_HAL_WRAP_NOT_OP(v_int64) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4) - OPENCV_HAL_WRAP_NOT_OP(v_uint8x16) - OPENCV_HAL_WRAP_NOT_OP(v_uint16x8) - OPENCV_HAL_WRAP_NOT_OP(v_uint32x4) - OPENCV_HAL_WRAP_NOT_OP(v_uint64x2) - OPENCV_HAL_WRAP_NOT_OP(v_int8x16) - OPENCV_HAL_WRAP_NOT_OP(v_int16x8) - OPENCV_HAL_WRAP_NOT_OP(v_int32x4) - OPENCV_HAL_WRAP_NOT_OP(v_int64x2) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8) - OPENCV_HAL_WRAP_NOT_OP(v_uint8x32) - OPENCV_HAL_WRAP_NOT_OP(v_uint16x16) - OPENCV_HAL_WRAP_NOT_OP(v_uint32x8) - OPENCV_HAL_WRAP_NOT_OP(v_uint64x4) - OPENCV_HAL_WRAP_NOT_OP(v_int8x32) - OPENCV_HAL_WRAP_NOT_OP(v_int16x16) - OPENCV_HAL_WRAP_NOT_OP(v_int32x8) - OPENCV_HAL_WRAP_NOT_OP(v_int64x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4) - #endif - #endif - - #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ - inline _Tpvec v_mul(const _Tpvec& a, const 
_Tpvec& b) \ - { \ - return a * b; \ - } \ - template \ - inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ - return v_mul(f1 * f2, vf...); \ - } - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4) - #endif - #endif - - #define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \ - inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a / b; \ - } - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4) - #endif - #endif - - #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \ - inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a op b; \ - } - #define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \ - inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a == b; \ - } \ - inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a != b; \ - } - - #define OPENCV_HAL_WRAP_CMP(_Tpvec) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=) - - OPENCV_HAL_WRAP_CMP(v_uint8) - OPENCV_HAL_WRAP_CMP(v_uint16) - OPENCV_HAL_WRAP_CMP(v_uint32) - OPENCV_HAL_WRAP_EQ_OP(v_uint64) - OPENCV_HAL_WRAP_CMP(v_int8) - OPENCV_HAL_WRAP_CMP(v_int16) - OPENCV_HAL_WRAP_CMP(v_int32) - OPENCV_HAL_WRAP_EQ_OP(v_int64) - OPENCV_HAL_WRAP_CMP(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_CMP(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_CMP(v_uint8x16) - OPENCV_HAL_WRAP_CMP(v_uint16x8) - OPENCV_HAL_WRAP_CMP(v_uint32x4) - OPENCV_HAL_WRAP_EQ_OP(v_uint64x2) - OPENCV_HAL_WRAP_CMP(v_int8x16) - OPENCV_HAL_WRAP_CMP(v_int16x8) - OPENCV_HAL_WRAP_CMP(v_int32x4) - OPENCV_HAL_WRAP_EQ_OP(v_int64x2) - OPENCV_HAL_WRAP_CMP(v_float32x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_CMP(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_CMP(v_uint8x32) - OPENCV_HAL_WRAP_CMP(v_uint16x16) - OPENCV_HAL_WRAP_CMP(v_uint32x8) - OPENCV_HAL_WRAP_EQ_OP(v_uint64x4) - 
OPENCV_HAL_WRAP_CMP(v_int8x32) - OPENCV_HAL_WRAP_CMP(v_int16x16) - OPENCV_HAL_WRAP_CMP(v_int32x8) - OPENCV_HAL_WRAP_EQ_OP(v_int64x4) - OPENCV_HAL_WRAP_CMP(v_float32x8) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_CMP(v_float64x4) - #endif - #endif - OPENCV_HAL_WRAP_CMP_OP(v_int64, lt, <) \ - OPENCV_HAL_WRAP_CMP_OP(v_int64, gt, >) \ - - //////////// get0 //////////// #define OPENCV_HAL_WRAP_GRT0(_Tpvec) \ inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \ @@ -1133,6 +831,102 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_GRT0(v_float64x4) #endif #endif +#endif + + #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ + template \ + inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \ + return v_add(v_add(f1, f2), f3, vf...); \ + } + + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) + #endif + #if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float16) + #endif // (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + // when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + // when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4) + #endif + #endif + + #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ + template \ + inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... 
vf) { \ + return v_mul(v_mul(f1, f2), f3, vf...); \ + } + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) + #endif + #if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float16) + #endif // (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4) + #endif + #endif #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ @@ -1149,6 +943,9 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_EXTRACT(v_uint64) OPENCV_HAL_WRAP_EXTRACT(v_int64) OPENCV_HAL_WRAP_EXTRACT(v_float32) + #if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + OPENCV_HAL_WRAP_EXTRACT(v_float16) + #endif #if CV_SIMD_64F OPENCV_HAL_WRAP_EXTRACT(v_float64) #endif @@ -1190,6 +987,9 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BROADCAST(v_uint32) OPENCV_HAL_WRAP_BROADCAST(v_int32) OPENCV_HAL_WRAP_BROADCAST(v_float32) + #if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + OPENCV_HAL_WRAP_BROADCAST(v_float16) + #endif #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 OPENCV_HAL_WRAP_BROADCAST(v_uint32x4) OPENCV_HAL_WRAP_BROADCAST(v_int32x4) @@ -1203,83 +1003,6 @@ namespace CV__SIMD_NAMESPACE { #endif //!CV_SIMD_SCALABLE -#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP) -// Compatibility layer for the backend that cleaned up. - #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ - template \ - inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ - return v_add(v_add(f1, f2), vf...); \ - } - - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) - #if CV_SIMD_FP16 - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float16) - #endif - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) - #endif - - #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ - template \ - inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... 
vf) { \ - return v_mul(v_mul(f1, f2), vf...); \ - } - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) - #if CV_SIMD_FP16 - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float16) - #endif - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) - #endif - - #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ - inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ - { \ - return v_extract_n::nlanes-1>(v); \ - } - - OPENCV_HAL_WRAP_EXTRACT(v_uint8) - OPENCV_HAL_WRAP_EXTRACT(v_int8) - OPENCV_HAL_WRAP_EXTRACT(v_uint16) - OPENCV_HAL_WRAP_EXTRACT(v_int16) - OPENCV_HAL_WRAP_EXTRACT(v_uint32) - OPENCV_HAL_WRAP_EXTRACT(v_int32) - OPENCV_HAL_WRAP_EXTRACT(v_uint64) - OPENCV_HAL_WRAP_EXTRACT(v_int64) - #if CV_SIMD_FP16 - OPENCV_HAL_WRAP_EXTRACT(v_float16) - #endif - OPENCV_HAL_WRAP_EXTRACT(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_EXTRACT(v_float64) - #endif - - #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \ - inline _Tpvec v_broadcast_highest(const _Tpvec& v) \ - { \ - return v_broadcast_element::nlanes-1>(v); \ - } - - OPENCV_HAL_WRAP_BROADCAST(v_uint32) - OPENCV_HAL_WRAP_BROADCAST(v_int32) - OPENCV_HAL_WRAP_BROADCAST(v_float32) - -#endif //CV_NEON - //! @cond IGNORED // backward compatibility diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index b208479839..3a8505a297 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -673,53 +673,51 @@ OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4) /** Arithmetics **/ #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, b.val); return a; } + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint8x32, _mm256_adds_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint8x32, _mm256_subs_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int8x32, _mm256_adds_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int8x32, _mm256_subs_epi8) 
+OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint16x16, _mm256_adds_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint16x16, _mm256_subs_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int16x16, _mm256_adds_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int16x16, _mm256_subs_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_uint32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_int32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint64x4, _mm256_sub_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int64x4, _mm256_sub_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float32x8, _mm256_add_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float32x8, _mm256_sub_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float32x8, _mm256_mul_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float32x8, _mm256_div_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float64x4, _mm256_add_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float64x4, _mm256_sub_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float64x4, _mm256_mul_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float64x4, _mm256_div_pd) // saturating multiply 8-bit, 16-bit -inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) +inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b) { v_uint16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) +inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b) { v_int16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) +inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b) { __m256i pl = _mm256_mullo_epi16(a.val, b.val); __m256i ph = _mm256_mulhi_epu16(a.val, b.val); @@ -727,7 +725,7 @@ inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) __m256i p1 = _mm256_unpackhi_epi16(pl, ph); return v_uint16x16(_v256_packs_epu32(p0, p1)); } -inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) +inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b) { __m256i pl = _mm256_mullo_epi16(a.val, b.val); __m256i ph = _mm256_mulhi_epi16(a.val, b.val); @@ -735,14 +733,6 @@ inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) __m256i p1 = _mm256_unpackhi_epi16(pl, ph); return v_int16x16(_mm256_packs_epi32(p0, p1)); } -inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b) -{ a = a * b; return a; } -inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b) -{ a = a * b; return a; } -inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b) -{ a = a * b; return a; } -inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b) -{ a = a 
* b; return a; } /** Non-saturating arithmetics **/ #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ @@ -833,13 +823,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { return _Tpsvec(srai(a.val, imm)); } \ template<int imm> \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -867,11 +857,11 @@ OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx /** Bitwise logic **/ -#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ +#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_and, _Tpvec, _mm256_and_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_or, _Tpvec, _mm256_or_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_xor, _Tpvec, _mm256_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); } OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1)) @@ -900,29 +890,29 @@ OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd) /** Comparison **/ -#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } \ - inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ - { return b > a; } \ - inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a < b); } \ - inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ - { return b >= a; } +#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } \ + inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ + { return v_gt(b, a); } \ + inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_lt(a, b)); } \ + inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ + { return v_ge(b, a); } #define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \ - inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ - inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m256i smask = _mm256_set1_##suffix(sbit); \ return _Tpuvec(_mm256_cmpgt_##suffix( \ _mm256_xor_si256(a.val, smask), \ _mm256_xor_si256(b.val, smask))); \ } \ - inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ - inline
_Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec) @@ -932,30 +922,30 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768) OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \ - inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } -inline v_int64x4 operator > (const v_int64x4& a, const v_int64x4& b) +inline v_int64x4 v_gt(const v_int64x4& a, const v_int64x4& b) { return v_int64x4(_mm256_cmpgt_epi64(a.val, b.val)); } -inline v_int64x4 operator < (const v_int64x4& a, const v_int64x4& b) +inline v_int64x4 v_lt(const v_int64x4& a, const v_int64x4& b) { return v_int64x4(_mm256_cmpgt_epi64(b.val, a.val)); } OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4) OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4) #define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); } #define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix) + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, suffix) OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd) @@ -1221,9 +1211,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a) { return v_reduce_sum(v_reinterpret_as_s32(a)); } inline int v_reduce_sum(const v_int16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline unsigned v_reduce_sum(const v_uint16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline float v_reduce_sum(const v_float32x8& a) { @@ -1278,27 +1268,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) { v_uint32x8 l, h; - v_expand(v_add_wrap(a - b, b - a), l, h); - return v_reduce_sum(l + h); + v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) { v_uint32x8 l, h; v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); - return 
v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) { - return v_reduce_sum(v_max(a, b) - v_min(a, b)); + return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); } inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) { - v_int32x8 m = a < b; - return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m)); + v_int32x8 m = v_lt(a, b); + return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m))); } inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) { - return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); + return v_reduce_sum(v_and(v_sub(a, b), v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))))); } /** Popcount **/ @@ -1313,15 +1303,15 @@ inline v_uint8x32 v_popcount(const v_uint8x32& a) inline v_uint16x16 v_popcount(const v_uint16x16& a) { v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v256_setall_u16(0x00ff)); } inline v_uint32x8 v_popcount(const v_uint32x8& a) { v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v256_setall_u32(0x000000ff)); } inline v_uint64x4 v_popcount(const v_uint64x4& a) { @@ -1413,9 +1403,9 @@ OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16) inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b*b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd) @@ -1424,7 +1414,7 @@ OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd) inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) @@ -1434,16 +1424,16 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x inline v_float32x8 v_invsqrt(const v_float32x8& x) { - v_float32x8 half = x * v256_setall_f32(0.5); + v_float32x8 half = v_mul(x, v256_setall_f32(0.5)); v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val)); // todo: _mm256_fnmsub_ps - t *= v256_setall_f32(1.5) - ((t * t) * half); + t = v_mul(t, v_sub(v256_setall_f32(1.5), v_mul(v_mul(t, t), half))); return t; } inline v_float64x4 v_invsqrt(const v_float64x4& x) { - return v256_setall_f64(1.) 
/ v_sqrt(x); + return v_div(v256_setall_f64(1.), v_sqrt(x)); } /** Absolute values **/ @@ -1456,23 +1446,23 @@ OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16) OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32) inline v_float32x8 v_abs(const v_float32x8& x) -{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); } +{ return v_and(x, v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); } inline v_float64x4 v_abs(const v_float64x4& x) -{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); } +{ return v_and(x, v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1)))); } /** Absolute difference **/ inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) { v_int8x32 d = v_sub_wrap(a, b); - v_int8x32 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x32 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) @@ -1480,26 +1470,26 @@ inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) { - v_int32x8 d = a - b; - v_int32x8 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x8 d = v_sub(a, b); + v_int32x8 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) { - v_int8x32 d = a - b; - v_int8x32 m = a < b; - return (d ^ m) - m; + v_int8x32 d = v_sub(a, b); + v_int8x32 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ////////// Conversions ///////// @@ -1794,7 +1784,7 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec) inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) @@ -1804,7 +1794,7 @@ inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) return v_int64x4(_mm256_add_epi64(even, odd)); } inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) @@ -1821,7 +1811,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) return v_uint32x8(_mm256_add_epi32(prod0, prod1)); 
} inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) { @@ -1836,7 +1826,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) return v_int32x8(_mm256_add_epi32(prod0, prod1)); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) @@ -1860,7 +1850,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) )); } inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) { @@ -1876,13 +1866,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) )); } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1928,7 +1918,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& return v_uint64x4(_mm256_add_epi64(p15_, p9d_)); } inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) { @@ -1939,7 +1929,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) return v_int64x4(_mm256_add_epi64(lo, hi)); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) @@ -1958,7 +1948,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3); - return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); } inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, @@ -2063,43 +2053,43 @@ v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
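// Illustrative arithmetic, not from the original source: with n = 4, a lane value of 40
// becomes (40 + 8) >> 4 = 3, i.e. 40/16 = 2.5 rounded half up, where a truncating shift
// alone would give 2.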
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), - v_reinterpret_as_s16((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s16(v_shr(v_add(b, delta), n))); } template<int n> inline void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) { v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n))); } template<int n> inline v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(schar* ptr, const v_int16x16& a) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 32 @@ -2132,43 +2122,43 @@ v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) { // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers. v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), - v_reinterpret_as_s32((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s32(v_shr(v_add(b, delta), n))); } template<int n> inline void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) { v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n))); } template<int n> inline v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(short* ptr, const v_int32x8& a) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 64 @@ -2197,28 +2187,28 @@ template<int n> inline v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) { v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(unsigned* ptr, const
v_uint64x4& a) { v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) { v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(int* ptr, const v_int64x4& a) { v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // pack boolean diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index e59b8d92eb..64dab6b3ae 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -663,58 +663,56 @@ inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b) } #define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, b.val); return a; } + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int64x8, _mm512_sub_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int64x8, _mm512_mullo_epi64) /** Saturating arithmetics **/ -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32,
_mm512_subs_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint8x64, _mm512_adds_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint8x64, _mm512_subs_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int8x64, _mm512_adds_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int8x64, _mm512_subs_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint16x32, _mm512_adds_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint16x32, _mm512_subs_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int16x32, _mm512_adds_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int16x32, _mm512_subs_epi16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float32x16, _mm512_add_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float32x16, _mm512_sub_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float32x16, _mm512_mul_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float32x16, _mm512_div_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float64x8, _mm512_add_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float64x8, _mm512_sub_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float64x8, _mm512_mul_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float64x8, _mm512_div_pd) // saturating multiply -inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b) +inline v_uint8x64 v_mul(const v_uint8x64& a, const v_uint8x64& b) { v_uint16x32 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b) +inline v_int8x64 v_mul(const v_int8x64& a, const v_int8x64& b) { v_int16x32 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) +inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b) { __m512i pl = _mm512_mullo_epi16(a.val, b.val); __m512i ph = _mm512_mulhi_epu16(a.val, b.val); @@ -724,7 +722,7 @@ inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) const __m512i m = _mm512_set1_epi32(65535); return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m))); } -inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) +inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b) { __m512i pl = _mm512_mullo_epi16(a.val, b.val); __m512i ph = _mm512_mulhi_epi16(a.val, b.val); @@ -733,15 +731,6 @@ inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) return v_int16x32(_mm512_packs_epi32(p0, p1)); } -inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b) -{ a = a * b; return a; } -inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b) -{ a = a * b; return a; } -inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b) -{ a = a * b; return a; } -inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b) -{ a = a * b; return a; } - inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } @@ -802,13 +791,13 @@ inline 
void v_mul_expand(const v_int32x16& a, const v_int32x16& b, /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \ template<int imm> \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -830,10 +819,10 @@ OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_and, _Tpvec, _mm512_and_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_or, _Tpvec, _mm512_or_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_xor, _Tpvec, _mm512_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); } OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1)) @@ -865,16 +854,16 @@ OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd) /** Comparison **/ #define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); } #define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_eq, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ne, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_lt, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_gt, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_le, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ge, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1) @@ -886,16 +875,16 @@ OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1) #define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ -
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); } #define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1) @@ -1250,9 +1239,9 @@ OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16) OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16) inline int v_reduce_sum(const v_int16x32& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline uint v_reduce_sum(const v_uint16x32& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } #define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \ inline sctype v_reduce_##func(const _Tpvec& a) \ @@ -1306,17 +1295,17 @@ inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b) return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); } inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b) -{ return v_reduce_sum(v_add_wrap(a - b, b - a)); } +{ return v_reduce_sum(v_add_wrap(v_sub(a, b), v_sub(b, a))); } inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b) { return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); } inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b) -{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); } +{ return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); } inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b) -{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); } +{ return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b)))); } inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b) -{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); } +{ return v_reduce_sum(v_and(v_sub(a, b), v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))))); } inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b) -{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); } +{ return v_reduce_sum(v_and(v_sub(a, b), 
v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff))))); } /** Popcount **/ inline v_uint8x64 v_popcount(const v_int8x64& a) @@ -1351,8 +1340,8 @@ inline v_uint16x32 v_popcount(const v_int16x32& a) _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero)))); #else v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v512_setall_u16(0x00ff)); #endif } inline v_uint32x16 v_popcount(const v_int32x16& a) @@ -1361,9 +1350,9 @@ inline v_uint32x16 v_popcount(const v_int32x16& a) return v_uint32x16(_mm512_popcnt_epi32(a.val)); #else v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v512_setall_u32(0x000000ff)); #endif } inline v_uint64x8 v_popcount(const v_int64x8& a) @@ -1403,9 +1392,9 @@ inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinte inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b * b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps) OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd) @@ -1413,7 +1402,7 @@ OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps) OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd) inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) -{ return a * b + c; } +{ return v_add(v_mul(a, b), c); } inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) { return v_fma(a, b, c); } @@ -1422,9 +1411,9 @@ inline v_float32x16 v_invsqrt(const v_float32x16& x) #if CV_AVX_512ER return v_float32x16(_mm512_rsqrt28_ps(x.val)); #else - v_float32x16 half = x * v512_setall_f32(0.5); + v_float32x16 half = v_mul(x, v512_setall_f32(0.5)); v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val)); - t *= v512_setall_f32(1.5) - ((t * t) * half); + t = v_mul(t, v_sub(v512_setall_f32(1.5), v_mul(v_mul(t, t), half))); return t; #endif } @@ -1434,7 +1423,7 @@ inline v_float64x8 v_invsqrt(const v_float64x8& x) #if CV_AVX_512ER return v_float64x8(_mm512_rsqrt28_pd(x.val)); #else - return v512_setall_f64(1.) 
/ v_sqrt(x); + return v_div(v512_setall_f64(1.), v_sqrt(x)); // v_float64x8 half = x * v512_setall_f64(0.5); // v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val)); // t *= v512_setall_f64(1.5) - ((t * t) * half); @@ -1482,17 +1471,17 @@ inline v_float64x8 v_abs(const v_float64x8& x) /** Absolute difference **/ inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b) { v_int8x64 d = v_sub_wrap(a, b); - v_int8x64 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x64 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) @@ -1500,26 +1489,26 @@ inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b) { - v_int32x16 d = a - b; - v_int32x16 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x16 d = v_sub(a, b); + v_int32x16 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b) { - v_int8x64 d = a - b; - v_int8x64 m = a < b; - return (d ^ m) - m; + v_int8x64 d = v_sub(a, b); + v_int8x64 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ////////// Conversions ///////// @@ -1818,7 +1807,7 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec) inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) { return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) @@ -1828,7 +1817,7 @@ inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) return v_int64x8(_mm512_add_epi64(even, odd)); } inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) @@ -1844,7 +1833,7 @@ inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) return v_uint32x16(_mm512_add_epi32(prod0, prod1)); } inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) { @@ -1859,7 +1848,7 @@ inline v_int32x16 v_dotprod_expand(const 
v_int8x64& a, const v_int8x64& b) return v_int32x16(_mm512_add_epi32(prod0, prod1)); } inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) @@ -1883,7 +1872,7 @@ inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) )); } inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) { @@ -1893,13 +1882,13 @@ inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) return v_int64x8(_mm512_add_epi64(even, odd)); } inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1944,7 +1933,7 @@ inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& return v_uint64x8(_mm512_add_epi64(p15_, p9d_)); } inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) { return v_dotprod_expand(a, b); } @@ -1955,7 +1944,7 @@ inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b) { return v_dotprod_expand(a, b); } inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } #define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ @@ -1969,7 +1958,7 @@ inline v_float32x16 v_matmul(const v_float32x16& v, v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3); - return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); } inline v_float32x16 v_matmuladd(const v_float32x16& v, @@ -2070,43 +2059,43 @@ v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); - return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), - v_reinterpret_as_s16((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s16(v_shr(v_add(b, delta), n))); } template<int n> inline void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a) { v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); - v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n))); } template<int n> inline v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(schar* ptr, const v_int16x32& a) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 32 @@ -2139,43 +2128,43 @@ template<int n> inline v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b) { v_uint32x16 delta = v512_setall_u32(1 << (n-1)); - return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), - v_reinterpret_as_s32((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s32(v_shr(v_add(b, delta), n))); } template<int n> inline void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a) { v_uint32x16 delta = v512_setall_u32(1 << (n-1)); - v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n))); } template<int n> inline v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(short* ptr, const v_int32x16& a) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 64 @@ -2196,28 +2185,28 @@ template<int n> inline v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b) { v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a) { v_uint64x8 delta = v512_setall_u64((uint64)1 <<
(n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b) { v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(int* ptr, const v_int64x8& a) { v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // pack boolean diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index fbc6ad82e5..fed7cc261a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -225,32 +225,32 @@ These operations allow to reorder or recombine elements in one or multiple vecto Element-wise binary and unary operations. - Arithmetics: -@ref operator +(const v_reg &a, const v_reg &b) "+", -@ref operator -(const v_reg &a, const v_reg &b) "-", -@ref operator *(const v_reg &a, const v_reg &b) "*", -@ref operator /(const v_reg &a, const v_reg &b) "/", +@ref v_add(const v_reg &a, const v_reg &b) "+", +@ref v_sub(const v_reg &a, const v_reg &b) "-", +@ref v_mul(const v_reg &a, const v_reg &b) "*", +@ref v_div(const v_reg &a, const v_reg &b) "/", @ref v_mul_expand - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap - Bitwise shifts: -@ref operator <<(const v_reg &a, int s) "<<", -@ref operator >>(const v_reg &a, int s) ">>", +@ref v_shl(const v_reg &a, int s) "<<", +@ref v_shr(const v_reg &a, int s) ">>", @ref v_shl, @ref v_shr - Bitwise logic: -@ref operator &(const v_reg &a, const v_reg &b) "&", -@ref operator |(const v_reg &a, const v_reg &b) "|", -@ref operator ^(const v_reg &a, const v_reg &b) "^", -@ref operator ~(const v_reg &a) "~" +@ref v_and(const v_reg &a, const v_reg &b) "&", +@ref v_or(const v_reg &a, const v_reg &b) "|", +@ref v_xor(const v_reg &a, const v_reg &b) "^", +@ref v_not(const v_reg &a) "~" - Comparison: -@ref operator >(const v_reg &a, const v_reg &b) ">", -@ref operator >=(const v_reg &a, const v_reg &b) ">=", -@ref operator <(const v_reg &a, const v_reg &b) "<", -@ref operator <=(const v_reg &a, const v_reg &b) "<=", -@ref operator ==(const v_reg &a, const v_reg &b) "==", -@ref operator !=(const v_reg &a, const v_reg &b) "!=" +@ref v_gt(const v_reg &a, const v_reg &b) ">", +@ref v_ge(const v_reg &a, const v_reg &b) ">=", +@ref v_lt(const v_reg &a, const v_reg &b) "<", +@ref v_le(const v_reg &a, const v_reg &b) "<=", +@ref v_eq(const v_reg &a, const v_reg &b) "==", +@ref v_ne(const v_reg &a, const v_reg &b) "!=" - min/max: @ref v_min, @ref v_max @@ -573,50 +573,43 @@ enum { /** @brief Add values For all types. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_add(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Subtract values For all types. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_sub(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Multiply values For 16- and 32-bit integer types and floating types.
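Results are saturated for the integer types (the reference implementation below uses saturate_cast); for example, v_mul(v_setall_s16(300), v_setall_s16(200)) produces lanes of 32767 rather than 60000.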
*/ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_mul(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Divide values For floating types only. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_div(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise AND Only for integer types. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_and(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise OR Only for integer types. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_or(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise XOR Only for integer types.*/ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_xor(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise NOT Only for integer types.*/ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a); #ifndef CV_DOXYGEN @@ -639,33 +632,26 @@ __CV_EXPAND(macro_name(double, __VA_ARGS__)) \ CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \ CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \ -#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \ +#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op, func) \ template<int n> inline \ -v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ return c; \ -} \ -template<int n> inline \ -v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return a; \ } -#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op) +#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op, func) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op, func) -CV__HAL_INTRIN_IMPL_BIN_OP(+) -CV__HAL_INTRIN_IMPL_BIN_OP(-) -CV__HAL_INTRIN_IMPL_BIN_OP(*) -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /) +CV__HAL_INTRIN_IMPL_BIN_OP(+, v_add) +CV__HAL_INTRIN_IMPL_BIN_OP(-, v_sub) +CV__HAL_INTRIN_IMPL_BIN_OP(*, v_mul) +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /, v_div) -#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \ +#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op, func) \ template<int n> CV_INLINE \ -v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ v_reg<_Tp, n> c; \ typedef typename V_TypeTraits<_Tp>::int_type itype; \ @@ -673,29 +659,20 @@
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ return c; \ -} \ -template<int n> CV_INLINE \ -v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return a; \ } -#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */ +#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op, func) \ +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) \ +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) /* TODO: FIXIT remove this after masks refactoring */ -CV__HAL_INTRIN_IMPL_BIT_OP(&) -CV__HAL_INTRIN_IMPL_BIT_OP(|) -CV__HAL_INTRIN_IMPL_BIT_OP(^) +CV__HAL_INTRIN_IMPL_BIT_OP(&, v_and) +CV__HAL_INTRIN_IMPL_BIT_OP(|, v_or) +CV__HAL_INTRIN_IMPL_BIT_OP(^, v_xor) -#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \ +#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy, dummy2) \ template<int n> CV_INLINE \ -v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ +v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ @@ -703,7 +680,7 @@ v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ return c; \ } \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~) +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~, v_not) #endif // !CV_DOXYGEN @@ -760,7 +737,6 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) * @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$. */ OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) -#define OPENCV_HAL_MATH_HAVE_LOG 1 /** * @brief Error function. @@ -771,9 +747,7 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp) //! @cond IGNORED OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) -#define OPENCV_HAL_MATH_HAVE_SIN 1 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) -#define OPENCV_HAL_MATH_HAVE_COS 1 //! @endcond /** @brief Absolute value of elements @@ -897,9 +871,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ +#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \ template<typename _Tp, int n> \ -inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ typedef typename V_TypeTraits<_Tp>::int_type itype; \ v_reg<_Tp, n> c; \ @@ -911,28 +885,28 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> /** @brief Less-than comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(<) +OPENCV_HAL_IMPL_CMP_OP(<, v_lt) /** @brief Greater-than comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(>) +OPENCV_HAL_IMPL_CMP_OP(>, v_gt) /** @brief Less-than or equal comparison For all types except 64-bit integer values.
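The result of a comparison is a per-lane bit mask: all bits of a lane are set when the predicate holds and cleared otherwise, so it can be passed directly to v_select.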
@@ -760,7 +737,6 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
  * @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$.
  */
 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
-#define OPENCV_HAL_MATH_HAVE_LOG 1
 
 /**
  * @brief Error function.
@@ -771,9 +747,7 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
 //! @cond IGNORED
 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
-#define OPENCV_HAL_MATH_HAVE_SIN 1
 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
-#define OPENCV_HAL_MATH_HAVE_COS 1
 //! @endcond
 
 /** @brief Absolute value of elements
@@ -897,9 +871,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
+#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \
 template<typename _Tp, int n> \
-inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 { \
     typedef typename V_TypeTraits<_Tp>::int_type itype; \
     v_reg<_Tp, n> c; \
@@ -911,28 +885,28 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
 
 /** @brief Less-than comparison
 
 For all types except 64-bit integer values. */
-OPENCV_HAL_IMPL_CMP_OP(<)
+OPENCV_HAL_IMPL_CMP_OP(<, v_lt)
 
 /** @brief Greater-than comparison
 
 For all types except 64-bit integer values. */
-OPENCV_HAL_IMPL_CMP_OP(>)
+OPENCV_HAL_IMPL_CMP_OP(>, v_gt)
 
 /** @brief Less-than or equal comparison
 
 For all types except 64-bit integer values. */
-OPENCV_HAL_IMPL_CMP_OP(<=)
+OPENCV_HAL_IMPL_CMP_OP(<=, v_le)
 
 /** @brief Greater-than or equal comparison
 
 For all types except 64-bit integer values. */
-OPENCV_HAL_IMPL_CMP_OP(>=)
+OPENCV_HAL_IMPL_CMP_OP(>=, v_ge)
 
 /** @brief Equal comparison */
-OPENCV_HAL_IMPL_CMP_OP(==)
+OPENCV_HAL_IMPL_CMP_OP(==, v_eq)
 
 /** @brief Not equal comparison */
-OPENCV_HAL_IMPL_CMP_OP(!=)
+OPENCV_HAL_IMPL_CMP_OP(!=, v_ne)
 
 template<int n> inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
@@ -1301,8 +1275,8 @@ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
+#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op, func) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, int imm) \
 { \
     v_reg<_Tp, n> c; \
     for( int i = 0; i < n; i++ ) \
@@ -1313,12 +1287,12 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg
 
 /** @brief Bitwise shift left
 
 For 16-, 32- and 64-bit integer values. */
-OPENCV_HAL_IMPL_SHIFT_OP(<< )
+OPENCV_HAL_IMPL_SHIFT_OP(<<, v_shl)
 
 /** @brief Bitwise shift right
 
 For 16-, 32- and 64-bit integer values. */
-OPENCV_HAL_IMPL_SHIFT_OP(>> )
+OPENCV_HAL_IMPL_SHIFT_OP(>>, v_shr)
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
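[Note: two details matter when porting callers of the comparison and shift operators above. Comparisons return a full-width per-lane mask (all bits set or all bits clear), so the result feeds `v_select` directly; and the shifts exist both as the run-time form `v_shr(a, imm)` defined here and the compile-time `v_shr<imm>(a)` form defined just below. A caller-side sketch, not part of the patch:

    // halve every lane, clamping negatives to zero first
    inline cv::v_int32x4 halve_nonneg(const cv::v_int32x4& v)
    {
        cv::v_int32x4 zero = cv::v_setzero_s32();
        cv::v_int32x4 neg  = cv::v_lt(v, zero);       // was: v < zero
        cv::v_int32x4 c    = cv::v_select(neg, zero, v);
        return cv::v_shr(c, 1);                       // was: c >> 1
    }
]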
@@ -2942,7 +2916,7 @@ OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
 template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
-{ return a << shift; }
+{ return v_shl(a, shift); }
 
 //! @name Left shift
 //! @{
@@ -2959,7 +2933,7 @@ OPENCV_HAL_IMPL_C_SHIFTL(int64)
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
 template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
-{ return a >> shift; }
+{ return v_shr(a, shift); }
 
 //! @name Right shift
 //! @{
@@ -3285,7 +3259,7 @@ inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
 
 template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a,
                                                            const v_reg<int, n>& b)
-{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
+{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
 template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
                                                            const v_reg<double, n/2>& c)
 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
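[Note: the generic f64 expansion rewritten above is easiest to read as scalar code. Judging from the `v_cvt_f64` / `v_cvt_f64_high` split, this fallback pairs the low half of the vector with the high half rather than adjacent lanes; a scalar model of that reading, an illustration only and not code from the patch:

    // n int32 lanes -> n/2 doubles
    static void dotprod_expand_f64_model(const int* a, const int* b, double* r, int n)
    {
        for (int i = 0; i < n / 2; i++)
            r[i] = (double)a[i] * b[i] + (double)a[i + n/2] * b[i + n/2];
    }
]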
diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
index db491cc137..45f53de8a2 100644
--- a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
@@ -746,53 +746,51 @@ OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
 /** Arithmetics **/
 #define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
-    { return _Tpvec(intrin(a.val, b.val)); } \
-    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-    { a.val = intrin(a.val, b.val); return a; }
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(intrin(a.val, b.val)); }
 
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint8x32, __lasx_xvsadd_bu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint8x32, __lasx_xvssub_bu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int8x32, __lasx_xvsadd_b)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int8x32, __lasx_xvssub_b)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint16x16, __lasx_xvsadd_hu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint16x16, __lasx_xvssub_hu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int16x16, __lasx_xvsadd_h)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int16x16, __lasx_xvssub_h)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint32x8, __lasx_xvadd_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint32x8, __lasx_xvsub_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_uint32x8, __lasx_xvmul_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int32x8, __lasx_xvadd_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int32x8, __lasx_xvsub_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_int32x8, __lasx_xvmul_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint64x4, __lasx_xvadd_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint64x4, __lasx_xvsub_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int64x4, __lasx_xvadd_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int64x4, __lasx_xvsub_d)
 
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
-OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
-OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float32x8, __lasx_xvfadd_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float32x8, __lasx_xvfsub_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float32x8, __lasx_xvfmul_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float32x8, __lasx_xvfdiv_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float64x4, __lasx_xvfadd_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float64x4, __lasx_xvfsub_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float64x4, __lasx_xvfmul_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float64x4, __lasx_xvfdiv_d)
 
 // saturating multiply 8-bit, 16-bit
-inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
+inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
 {
     v_uint16x16 c, d;
     v_mul_expand(a, b, c, d);
     return v_pack(c, d);
 }
-inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
+inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
 {
     v_int16x16 c, d;
     v_mul_expand(a, b, c, d);
     return v_pack(c, d);
 }
-inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
+inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
 {
     __m256i pl = __lasx_xvmul_h(a.val, b.val);
     __m256i ph = __lasx_xvmuh_hu(a.val, b.val);
@@ -800,7 +798,7 @@ inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
     __m256i p0 = __lasx_xvilvl_h(ph, pl);
     __m256i p1 = __lasx_xvilvh_h(ph, pl);
     return v_uint16x16(_v256_packs_epu32(p0, p1));
 }
-inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
+inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
 {
     __m256i pl = __lasx_xvmul_h(a.val, b.val);
     __m256i ph = __lasx_xvmuh_h(a.val, b.val);
@@ -808,14 +806,6 @@ inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
     __m256i p1 = __lasx_xvilvh_h(ph, pl);
     return v_int16x16(_lasx_packs_w(p0, p1));
 }
-inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
-{ a = a * b; return a; }
-inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
-{ a = a * b; return a; }
-inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
-{ a = a * b; return a; }
-inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
-{ a = a * b; return a; }
 
 /** Non-saturating arithmetics **/
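[Note: there is no native 8/16-bit saturating multiply, which is why the `v_mul` overloads above widen, multiply exactly, and let `v_pack` saturate on the way back down. A scalar model of one u8 lane, an illustration only:

    static unsigned char mul_sat_u8_model(unsigned char a, unsigned char b)
    {
        int wide = (int)a * (int)b;                       // exact product in the wide type
        return (unsigned char)(wide > 255 ? 255 : wide);  // the clamp v_pack applies
    }
]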
@@ -904,13 +894,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
 /** Bitwise shifts **/
 #define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
-    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+    inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
     { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
-    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+    inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
     { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
-    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+    inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
     { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
-    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+    inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
     { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
     template<int imm> \
     inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -932,10 +922,10 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
 
 /** Bitwise logic **/
 #define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \
-    OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \
-    OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
-    OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
-    inline _Tpvec operator ~ (const _Tpvec& a) \
+    OPENCV_HAL_IMPL_LASX_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix) \
+    OPENCV_HAL_IMPL_LASX_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix) \
+    OPENCV_HAL_IMPL_LASX_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix) \
+    inline _Tpvec v_not(const _Tpvec& a) \
     { return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
 
 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
@@ -948,16 +938,14 @@ OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1))
 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1))
 
 #define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
-    { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \
-    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-    { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); }
 
 #define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \
-    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \
-    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \
-    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \
-    inline _Tpvec operator ~ (const _Tpvec& a) \
+    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix, cast) \
+    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix, cast) \
+    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix, cast) \
+    inline _Tpvec v_not(const _Tpvec& a) \
     { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }
 
 OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
@@ -983,25 +971,25 @@ inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const
 
 /** Comparison **/
 #define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \
-    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a == b); } \
-    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
-    { return b > a; } \
-    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a < b); } \
-    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
-    { return b >= a; }
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_eq(a, b)); } \
+    inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
+    { return v_gt(b, a); } \
+    inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_lt(a, b)); } \
+    inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
+    { return v_ge(b, a); }
 
 #define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
-    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+    inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
     { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
-    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+    inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
     { \
         return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \
     } \
-    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+    inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
     { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
-    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+    inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
     { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \
     OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \
     OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
@@ -1011,37 +999,37 @@ OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
 OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu)
 
 #define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \
-    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+    inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
     { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
-    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a == b); }
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_eq(a, b)); }
 
 OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
 OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)
 
 #define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
     { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }
 
 #define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(v_eq, xvfcmp_ceq, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(v_ne, xvfcmp_cne, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(v_lt, xvfcmp_clt, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(v_le, xvfcmp_cle, _Tpvec, ssuffix)
 
 OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
 OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
 
-inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
+inline v_float32x8 v_gt(const v_float32x8 &a, const v_float32x8 &b)
 { return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }
-inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
+inline v_float32x8 v_ge(const v_float32x8 &a, const v_float32x8 &b)
 { return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }
 
-inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
+inline v_float64x4 v_gt(const v_float64x4 &a, const v_float64x4 &b)
 { return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }
-inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
+inline v_float64x4 v_ge(const v_float64x4 &a, const v_float64x4 &b)
 { return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }
 
 inline v_float32x8 v_not_nan(const v_float32x8& a)
@@ -1309,9 +1297,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
 { return v_reduce_sum(v_reinterpret_as_s32(a)); }
 
 inline int v_reduce_sum(const v_int16x16& a)
-{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
 inline unsigned v_reduce_sum(const v_uint16x16& a)
-{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
 
 inline float v_reduce_sum(const v_float32x8& a)
 {
@@ -1379,27 +1367,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
 inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
 {
     v_uint32x8 l, h;
-    v_expand(v_add_wrap(a - b, b - a), l, h);
-    return v_reduce_sum(l + h);
+    v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
+    return v_reduce_sum(v_add(l, h));
 }
 inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
 {
     v_uint32x8 l, h;
     v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
-    return v_reduce_sum(l + h);
+    return v_reduce_sum(v_add(l, h));
 }
 inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
 {
-    return v_reduce_sum(v_max(a, b) - v_min(a, b));
+    return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
 }
 inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
 {
-    v_int32x8 m = a < b;
-    return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
+    v_int32x8 m = v_lt(a, b);
+    return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
 }
 inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
 {
-    v_float32x8 a_b = a - b;
+    v_float32x8 a_b = v_sub(a, b);
     return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
 }
@@ -1503,9 +1491,9 @@ OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
     inline _Tpvec v_sqrt(const _Tpvec& x) \
     { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \
     inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
-    { return v_fma(a, a, b * b); } \
+    { return v_fma(a, a, v_mul(b, b)); } \
     inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
-    { return v_sqrt(v_fma(a, a, b*b)); }
+    { return v_sqrt(v_fma(a, a, v_mul(b, b))); }
 
 OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
 OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
@@ -1556,20 +1544,20 @@ inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
 { return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
 
 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
-{ return v_abs(a - b); }
+{ return v_abs(v_sub(a, b)); }
 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
-{ return v_abs(a - b); }
+{ return v_abs(v_sub(a, b)); }
 
 /** Saturating absolute difference **/
 inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
 {
-    v_int8x32 d = a - b;
-    v_int8x32 m = a < b;
-    return (d ^ m) - m;
+    v_int8x32 d = v_sub(a, b);
+    v_int8x32 m = v_lt(a, b);
+    return v_sub(v_xor(d, m), m);
 }
 inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
-{ return v_max(a, b) - v_min(a, b); }
+{ return v_sub(v_max(a, b), v_min(a, b)); }
 
 ////////// Conversions /////////
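[Note: `v_absdiffs` above leans on a two's-complement identity: with `m` all-ones exactly where `a < b`, `(d ^ m) - m` equals `d` where `m == 0` and `~d + 1 == -d` where `m == -1`. A scalar model, illustration only (it ignores the 8-bit saturation that the vector `v_sub` adds):

    static int absdiffs_model(int a, int b)
    {
        int d = a - b;
        int m = (a < b) ? -1 : 0;   // comparison mask, as v_lt produces
        return (d ^ m) - m;         // d unchanged, or bit-flip-plus-one == -d
    }
]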
@@ -1891,7 +1879,7 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
 { return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
-{ return v_dotprod(a, b) + c; }
+{ return v_add(v_dotprod(a, b), c); }
 
 // 32 >> 64
 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
@@ -1915,7 +1903,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
     return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
 }
 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
 {
@@ -1926,7 +1914,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
     return v_int32x8(__lasx_xvadd_w(prod0, prod1));
 }
 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 // 16 >> 64
 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
@@ -1938,7 +1926,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
     return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
 }
 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
 {
@@ -1950,13 +1938,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
 }
 
 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 // 32 >> 64f
 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
 { return v_cvt_f64(v_dotprod(a, b)); }
 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 //////// Fast Dot Product ////////
 
@@ -1993,7 +1981,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
     return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
 }
 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
 {
@@ -2004,7 +1992,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
     return v_int64x4(__lasx_xvadd_d(lo, hi));
 }
 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 // 32 >> 64f
 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
@@ -2024,7 +2012,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
     v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
     v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
     v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
-    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
+    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
 }
 
 inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
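[Note: the `v_matmul` hunk above is the standard splat-and-FMA formulation of a 4x4 matrix-vector product: each component of `v` is broadcast and multiplied into one matrix row, chained through fused multiply-adds (the 256-bit LASX version carries two such products side by side). A scalar model of one product, illustration only:

    // r = v[0]*m[0] + v[1]*m[1] + v[2]*m[2] + v[3]*m[3], with m[k] a row
    static void matmul4_model(const float v[4], const float m[4][4], float r[4])
    {
        for (int j = 0; j < 4; j++)
            r[j] = v[0]*m[0][j] + v[1]*m[1][j] + v[2]*m[2][j] + v[3]*m[3][j];
    }
]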
diff --git a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
index 6e3290426f..aa997070c3 100644
--- a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
@@ -525,53 +525,51 @@ OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)
 /** Arithmetics **/
 #define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
-    { return _Tpvec(intrin(a.val, b.val)); } \
-    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-    { a.val = intrin(a.val, b.val); return a; }
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(intrin(a.val, b.val)); }
 
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint8x16, __lsx_vsadd_bu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint8x16, __lsx_vssub_bu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int8x16, __lsx_vsadd_b)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int8x16, __lsx_vssub_b)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint16x8, __lsx_vsadd_hu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint16x8, __lsx_vssub_hu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int16x8, __lsx_vsadd_h)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int16x8, __lsx_vssub_h)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint32x4, __lsx_vadd_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint32x4, __lsx_vsub_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_uint32x4, __lsx_vmul_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int32x4, __lsx_vadd_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int32x4, __lsx_vsub_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_int32x4, __lsx_vmul_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint64x2, __lsx_vadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint64x2, __lsx_vsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int64x2, __lsx_vadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int64x2, __lsx_vsub_d)
 
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
-OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
-OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float32x4, __lsx_vfadd_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float32x4, __lsx_vfsub_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float32x4, __lsx_vfmul_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float32x4, __lsx_vfdiv_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float64x2, __lsx_vfadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float64x2, __lsx_vfsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float64x2, __lsx_vfmul_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float64x2, __lsx_vfdiv_d)
 
 // saturating multiply 8-bit, 16-bit
-inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
+inline v_uint8x16 v_mul(const v_uint8x16& a, const v_uint8x16& b)
 {
     v_uint16x8 c, d;
     v_mul_expand(a, b, c, d);
     return v_pack(c, d);
 }
-inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
+inline v_int8x16 v_mul(const v_int8x16& a, const v_int8x16& b)
 {
     v_int16x8 c, d;
     v_mul_expand(a, b, c, d);
     return v_pack(c, d);
 }
-inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
+inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b)
 {
     __m128i a0 = a.val, b0 = b.val;
     __m128i pev = __lsx_vmulwev_w_hu(a0, b0);
@@ -580,7 +578,7 @@ inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
     __m128i ph = __lsx_vilvh_w(pod, pev);
     return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
 }
-inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
+inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b)
 {
     __m128i a0 = a.val, b0 = b.val;
     __m128i pev = __lsx_vmulwev_w_h(a0, b0);
@@ -589,14 +587,6 @@ inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
     __m128i ph = __lsx_vilvh_w(pod, pev);
     return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
 }
-inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
-{ a = a * b; return a; }
-inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
-{ a = a * b; return a; }
-inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
-{ a = a * b; return a; }
-inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
-{ a = a * b; return a; }
 
 /** Non-saturating arithmetics **/
 
@@ -681,13 +671,13 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
 
 /** Bitwise shifts **/
 #define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
-    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+    inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
     { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
-    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+    inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
    { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
-    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+    inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
     { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
-    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+    inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
     { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
     template<int imm> \
     inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -708,10 +698,10 @@ OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)
 
 /** Bitwise logic **/
 #define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \
-    OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \
-    OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \
-    OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \
-    inline _Tpvec operator ~(const _Tpvec& a) \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(v_and, _Tpvec, __lsx_vand_##suffix) \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(v_or, _Tpvec, __lsx_vor_##suffix) \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(v_xor, _Tpvec, __lsx_vxor_##suffix) \
+    inline _Tpvec v_not(const _Tpvec& a) \
     { return _Tpvec(__lsx_vnori_b(a.val, 0)); } \
 
 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v)
@@ -724,18 +714,14 @@ OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v)
 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
 
 #define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
-    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \
-    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-    { __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \
-      a.val = cast(c); \
-      return a;}
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }
 
 #define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \
-    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \
-    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \
-    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \
-    inline _Tpvec operator ~ (const _Tpvec& a) \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_and, _Tpvec, __lsx_vand_v, cast) \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_or, _Tpvec, __lsx_vor_v, cast) \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_xor, _Tpvec, __lsx_vxor_v, cast) \
+    inline _Tpvec v_not(const _Tpvec& a) \
     { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \
 
 OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
@@ -760,23 +746,23 @@ inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const
 
 /** Comparison **/
 #define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \
-    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
-    { return ~( a == b ); } \
-    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
-    { return b > a ; } \
-    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a < b); } \
-    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
-    { return b >= a; } \
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_eq(a, b)); } \
+    inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
+    { return v_gt(b, a); } \
+    inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_lt(a, b)); } \
+    inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
+    { return v_ge(b, a); } \
 
 #define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
-    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+    inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
     { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \
-    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+    inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
     { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \
-    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+    inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
     { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \
-    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+    inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
    { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \
     OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \
     OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
@@ -786,37 +772,37 @@ OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu)
 OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu)
 
 #define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \
-    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+    inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
     { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \
-    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a == b); }
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_eq(a, b)); }
 
 OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
 OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
 
 #define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
     { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \
 
 #define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(v_eq, vfcmp_ceq, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(v_ne, vfcmp_cne, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(v_lt, vfcmp_clt, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(v_le, vfcmp_cle, _Tpvec, ssuffix) \
 
 OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
 OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)
 
-inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
+inline v_float32x4 v_gt(const v_float32x4 &a, const v_float32x4 &b)
 { return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }
-inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
+inline v_float32x4 v_ge(const v_float32x4 &a, const v_float32x4 &b)
 { return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }
 
-inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
+inline v_float64x2 v_gt(const v_float64x2 &a, const v_float64x2 &b)
 { return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }
-inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
+inline v_float64x2 v_ge(const v_float64x2 &a, const v_float64x2 &b)
 { return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }
 
 inline v_float32x4 v_not_nan(const v_float32x4& a)
@@ -1188,7 +1174,7 @@ inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
 
 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
 {
-    v_float32x4 a_b = a - b;
+    v_float32x4 a_b = v_sub(a, b);
     return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff)));
 }
@@ -1295,9 +1281,9 @@ OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)
     inline _Tpvec v_sqrt(const _Tpvec& x) \
     { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \
     inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
-    { return v_fma(a, a, b * b); } \
+    { return v_fma(a, a, v_mul(b, b)); } \
     inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
-    { return v_sqrt(v_fma(a, a, b * b)); }
+    { return v_sqrt(v_fma(a, a, v_mul(b, b))); }
 
 OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
 OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)
@@ -1349,20 +1335,20 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
 { return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }
 
 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
-{ return v_abs(a - b); }
+{ return v_abs(v_sub(a, b)); }
 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
-{ return v_abs(a - b); }
+{ return v_abs(v_sub(a, b)); }
 
 /** Saturating absolute difference **/
 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
 {
-    v_int8x16 d = a - b;
-    v_int8x16 m = a < b;
-    return (d ^ m) - m;
+    v_int8x16 d = v_sub(a, b);
+    v_int8x16 m = v_lt(a, b);
+    return v_sub(v_xor(d, m), m);
 }
 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
-{ return v_max(a, b) - v_min(a, b); }
+{ return v_sub(v_max(a, b), v_min(a, b)); }
 
 ///////// Conversions /////////
@@ -1673,7 +1659,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
 }
 
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
-{ return v_dotprod_expand(a, b) + c ;}
+{ return v_add(v_dotprod_expand(a, b), c) ;}
 
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
 {
@@ -1685,7 +1671,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
     return v_int32x4(__lsx_vadd_w(prod0, prod1));
 }
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 // 16 >> 64
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
@@ -1698,7 +1684,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
     return v_uint64x2(__lsx_vadd_d(prod0, prod1));
 }
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
 {
@@ -1710,13 +1696,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
     return v_int64x2(__lsx_vadd_d(prod0, prod1));
 }
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 //32 >> 64f
 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
 { return v_cvt_f64(v_dotprod(a, b)); }
 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 ///////// Fast Dot Product //////
 
@@ -1755,7 +1741,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
     return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
 }
 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
 {
@@ -1767,7 +1753,7 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
     return v_int64x2(__lsx_vadd_d(lo, hi));
 }
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 // 32 >> 64f
 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
index 23d6ebd3d1..8d2c22b087 100644
--- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
@@ -345,53 +345,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
 }
 
 #define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
-inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
 { \
     return _Tpvec(intrin(a.val, b.val)); \
-} \
-inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-{ \
    a.val = intrin(a.val, b.val); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint8x16, msa_qaddq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint8x16, msa_qsubq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int8x16, msa_qaddq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int8x16, msa_qsubq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint16x8, msa_qaddq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint16x8, msa_qsubq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int16x8, msa_qaddq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int16x8, msa_qsubq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int32x4, msa_addq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int32x4, msa_subq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_int32x4, msa_mulq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint32x4, msa_addq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint32x4, msa_subq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_uint32x4, msa_mulq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float32x4, msa_addq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float32x4, msa_subq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float32x4, msa_mulq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int64x2, msa_addq_s64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int64x2, msa_subq_s64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint64x2, msa_addq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint64x2, msa_subq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float32x4, msa_divq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float64x2, msa_addq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float64x2, msa_subq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float64x2, msa_mulq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float64x2, msa_divq_f64)
 
 // saturating multiply 8-bit, 16-bit
 #define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
-inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
 { \
     _Tpwvec c, d; \
     v_mul_expand(a, b, c, d); \
     return v_pack(c, d); \
-} \
-inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
-{a = a * b; return a; }
+}
 
 OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
 OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
@@ -546,13 +539,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
     return v_int64x2(msa_hadd_s64(prod, prod));
 }
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 // 32 >> 64f
 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
 { return v_cvt_f64(v_dotprod(a, b)); }
 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 //////// Fast Dot Product ////////
 
@@ -596,10 +589,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b,
 { return v_dotprod_expand(a, b, c); }
 
 #define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
-OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
-OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
-OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
-inline _Tpvec operator ~ (const _Tpvec& a) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_and, _Tpvec, msa_andq_##suffix) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_or, _Tpvec, msa_orrq_##suffix) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_xor, _Tpvec, msa_eorq_##suffix) \
+inline _Tpvec v_not(const _Tpvec& a) \
 { \
     return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
 }
@@ -614,21 +607,16 @@ OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
 
 #define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
-inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
 { \
     return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
-} \
-inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
-{ \
-    a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
-OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
-OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_and, msa_andq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_or, msa_orrq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_xor, msa_eorq_s32)
 
-inline v_float32x4 operator ~ (const v_float32x4& a)
+inline v_float32x4 v_not(const v_float32x4& a)
 {
     return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
 }
@@ -659,21 +647,16 @@ OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
 OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
 
 #define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
-inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
 { \
     return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
-} \
-inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
-{ \
-    a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
-OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
-OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_and, msa_andq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_or, msa_orrq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_xor, msa_eorq_s64)
 
-inline v_float64x2 operator ~ (const v_float64x2& a)
+inline v_float64x2 v_not(const v_float64x2& a)
 {
     return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
 }
@@ -704,17 +687,17 @@ OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
 
 #define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
-inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
-inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
 
 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
@@ -821,9 +804,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
 
 // trade efficiency for convenience
 #define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
-inline _Tpvec operator << (const _Tpvec& a, int n) \
+inline _Tpvec v_shl(const _Tpvec& a, int n) \
 { return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
-inline _Tpvec operator >> (const _Tpvec& a, int n) \
+inline _Tpvec v_shr(const _Tpvec& a, int n) \
 { return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
 { return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
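[Note: the "trade efficiency for convenience" remark above concerns the run-time shift forms: MSA shifts only by a vector of counts, so the scalar `n` is first broadcast with `msa_dupq_n_*`, while the `template<int n>` forms below map to the immediate-count instructions. A caller-side sketch, not part of the patch:

    inline cv::v_uint16x8 shr3_both_ways(const cv::v_uint16x8& x)
    {
        cv::v_uint16x8 a = cv::v_shr(x, 3);  // run-time count: broadcast, then vector shift
        cv::v_uint16x8 b = cv::v_shr<3>(x);  // immediate form: a single shift instruction
        return cv::v_or(a, b);               // same value either way; OR just exercises both
    }
]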
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
index 5681ae211d..4900418df3 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
@@ -373,70 +373,50 @@ inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v,
 
 #define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
-inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
 { \
     return _Tpvec(intrin(a.val, b.val)); \
-} \
-inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-{ \
-    a.val = intrin(a.val, b.val); \
-    return a; \
 }
 
 #define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
-inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
 { \
     return _Tpvec(intrin(a.val, b.val, num)); \
-} \
-inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-{ \
-    a.val = intrin(a.val, b.val, num); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
-inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint8x16, vsaddu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint8x16, vssubu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int8x16, vsadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int8x16, vssub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint16x8, vsaddu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint16x8, vssubu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int16x8, vsadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int16x8, vssub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int32x4, vadd_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int32x4, vsub_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_int32x4, vmul_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint32x4, vadd_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint32x4, vsub_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_uint32x4, vmul_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int64x2, vadd_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int64x2, vsub_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint64x2, vadd_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint64x2, vsub_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float32x4, vfadd_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float32x4, vfsub_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float32x4, vfmul_vv_f32m1, 4)
+inline v_float32x4 v_div(const v_float32x4& a, const v_float32x4& b)
 {
     return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
 }
-inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
-{
-    a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
-    return a;
-}
 
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
-inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float64x2, vfadd_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float64x2, vfsub_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float64x2, vfmul_vv_f64m1, 2)
+inline v_float64x2 v_div(const v_float64x2& a, const v_float64x2& b)
 {
     return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
 }
-inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
-{
-    a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
-    return a;
-}
 
 // TODO: exp, log, sin, cos
 #define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
@@ -562,10 +542,10 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
 }
 
 #define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
-    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
-    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
-    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
-    inline _Tpvec operator ~ (const _Tpvec & a) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_and, _Tpvec, vand_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_or, _Tpvec, vor_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_xor, _Tpvec, vxor_vv_##suffix, num) \
+    inline _Tpvec v_not(const _Tpvec & a) \
     { \
         return _Tpvec(vnot_v_##suffix(a.val, num)); \
     }
@@ -580,41 +560,31 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
 
 #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
-inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
 { \
     return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
-} \
-inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
-{ \
-    a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
-OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
-OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_and, vand_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_or, vor_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_xor, vxor_vv_i32m1)
 
-inline v_float32x4 operator ~ (const v_float32x4& a)
+inline v_float32x4 v_not(const v_float32x4& a)
 {
     return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
 }
 
 #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
-inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
 { \
     return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
-} \
-inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
-{ \
-    a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
-OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
-OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_and, vand_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_or, vor_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_xor, vxor_vv_i64m1)
 
-inline v_float64x2 operator ~ (const v_float64x2& a)
+inline v_float64x2 v_not(const v_float64x2& a)
 {
     return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
 }
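[Note: unlike the other backends, RVV comparisons natively produce a `vbool` mask register; the wrappers in the next hunk immediately expand it to a value vector of 0 / -1 per lane via `vmerge_vxm_*`, so `v_eq`, `v_lt` and friends return the same all-bits-set mask convention the portable code expects. A per-lane scalar model, illustration only:

    static int cmp_lane_model(int a, int b)
    {
        // vmerge(mask, splat(0), -1): pick -1 where the mask bit is set, else 0
        return (a == b) ? -1 : 0;
    }
]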
const _Tpvec& b) \ { \ vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ @@ -1215,37 +1185,37 @@ OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_) OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_) //TODO: == -inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_eq(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_ne(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_lt(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_le(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_gt(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_ge(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, 
vmv_v_x_i32m1(0.0, 4), -1, 4); @@ -1259,37 +1229,37 @@ inline v_float32x4 v_not_nan(const v_float32x4& a) } //TODO: == -inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_eq(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_ne(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_lt(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_le(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_gt(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_ge(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); @@ -1331,13 +1301,13 @@ OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32) #define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ +inline _Tpvec v_shl(const _Tpvec& a, int n) \ { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \ template<int n> inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } #define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ +inline _Tpvec v_shr(const _Tpvec& a, int n) \ { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \ template<int n> inline _Tpvec v_shr(const _Tpvec& a) \ { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\ @@ -2037,13 +2007,11 @@ OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ auto res = mul(a.val, b.val, num); \ return _Tpvec(cvt(res, 0, num)); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1) OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1) @@ -2845,7 +2813,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a,
b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) { vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4); @@ -2854,7 +2822,7 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) { v_float64x2 res = v_dotprod_expand_fast(a, b); - return res + c; } + return v_add(res, c); } #endif ////// FP16 support /////// #if __riscv_v == 7000 diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 88b67ae250..ee4545db6b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -735,53 +735,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ - } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { \ - a.val = intrin(a.val, b.val); \ - return a; \ } -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint8x16, _mm_adds_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint8x16, _mm_subs_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int8x16, _mm_adds_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int8x16, _mm_subs_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint16x8, _mm_adds_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint16x8, _mm_subs_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int16x8, _mm_adds_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int16x8, _mm_subs_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint32x4, _mm_sub_epi32) 
+OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_uint32x4, _v128_mullo_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_int32x4, _v128_mullo_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float32x4, _mm_add_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float32x4, _mm_sub_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float32x4, _mm_mul_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float32x4, _mm_div_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float64x2, _mm_add_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float64x2, _mm_sub_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float64x2, _mm_mul_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float64x2, _mm_div_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint64x2, _mm_sub_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int64x2, _mm_sub_epi64) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8) @@ -845,7 +838,7 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { return v_int32x4(_mm_madd_epi16(a.val, b.val)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) @@ -872,7 +865,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) #endif } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) @@ -886,7 +879,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) return v_uint32x4(_mm_add_epi32(p0, p1)); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { @@ -899,7 +892,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) return v_int32x4(_mm_add_epi32(p0, p1)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) @@ -911,14 +904,14 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) v_expand(c, c0, c1); v_expand(d, d0, d1); - c0 += c1; d0 += d1; + c0 = v_add(c0, c1); d0 = v_add(d0, d1); return v_uint64x2(_mm_add_epi64( _mm_unpacklo_epi64(c0.val, d0.val), _mm_unpackhi_epi64(c0.val, d0.val) )); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return 
v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -931,7 +924,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) )); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) @@ -939,8 +932,8 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) #if CV_SSE4_1 return v_cvt_f64(v_dotprod(a, b)); #else - v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b); - v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b); + v_float64x2 c = v_mul(v_cvt_f64(a), v_cvt_f64(b)); + v_float64x2 d = v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b)); return v_float64x2(_mm_add_pd( _mm_unpacklo_pd(c.val, d.val), @@ -949,7 +942,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) #endif } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -957,13 +950,13 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) { return v_dotprod(a, b); } inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) { return v_dotprod(a, b); } inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod_fast(a, b) + c; } +{ return v_add(v_dotprod_fast(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) @@ -977,7 +970,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b return v_uint32x4(_mm_add_epi32(p0, p1)); } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) { @@ -994,7 +987,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) #endif } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) @@ -1006,34 +999,34 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b v_expand(c, c0, c1); v_expand(d, d0, d1); - c0 += c1; d0 += d1; - return c0 + d0; + c0 = v_add(c0, c1); d0 = v_add(d0, d1); + return v_add(c0, d0); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) { v_int32x4 prod = v_dotprod(a, b); v_int64x2 c, d; v_expand(prod, c, d); - return c + d; + return v_add(c, d); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ 
return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c); inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) -{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_and, _Tpvec, _mm_and_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_or, _Tpvec, _mm_or_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_xor, _Tpvec, _mm_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \ } @@ -1182,58 +1175,58 @@ inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) } #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ -inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_ne(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_ne(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_lt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ } \ -inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ } \ -inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_le(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ __m128i not_mask = _mm_set1_epi32(-1); \ __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ return _Tpuvec(_mm_xor_si128(res, not_mask)); \ } \ -inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_ge(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ __m128i not_mask = _mm_set1_epi32(-1); \ __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ return _Tpuvec(_mm_xor_si128(res, not_mask)); \ } \ -inline _Tpsvec operator < 
(const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_lt(const _Tpsvec& a, const _Tpsvec& b) \ { \ return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ } \ -inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { \ return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ } \ -inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_le(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_ge(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ @@ -1244,17 +1237,17 @@ OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) @@ -1262,26 +1255,28 @@ OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) #if CV_SSE4_1 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return ~(a == b); } +inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \ +{ return v_not(v_eq(a, b)); } #else #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \ return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return ~(a == b); } +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ +{ return v_not(v_eq(a, b)); } #endif -inline v_int64x2 operator > (const v_int64x2& a, const v_int64x2& b) +inline v_int64x2 v_gt(const v_int64x2& a, const v_int64x2& b) { __m128i s = _mm_srli_epi64(_mm_sub_epi64(b.val, a.val), 63); return v_int64x2(_mm_sub_epi64(_mm_setzero_si128(), s)); } -inline v_int64x2 operator < (const v_int64x2& a, const v_int64x2& b) -{ return b > a; } +inline v_int64x2 v_lt(const v_int64x2& a, const 
v_int64x2& b) +{ + return v_gt(b, a); +} OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2) OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2) @@ -1319,17 +1314,17 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) /** Absolute difference **/ inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { v_int8x16 d = v_sub_wrap(a, b); - v_int8x16 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x16 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { @@ -1337,25 +1332,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - v_int32x4 d = a - b; - v_int32x4 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x4 d = v_sub(a, b); + v_int32x4 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } /** Saturating absolute difference **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) { - v_int8x16 d = a - b; - v_int8x16 m = a < b; - return (d ^ m) - m; + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) @@ -1389,12 +1384,12 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpvec res = v_fma(a, a, b*b); \ + _Tpvec res = v_fma(a, a, v_mul(b, b)); \ return _Tpvec(_mm_sqrt_##suffix(res.val)); \ } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - return v_fma(a, a, b*b); \ + return v_fma(a, a, v_mul(b, b)); \ } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { \ @@ -1405,19 +1400,19 @@ OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32(( OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ -inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ } \ -inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ } \ -inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ } \ -inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(srai(a.val, imm)); \ } \ @@ -1719,9 +1714,9 @@ 
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_N OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32) inline int v_reduce_sum(const v_int16x8& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline unsigned v_reduce_sum(const v_uint16x8& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline uint64 v_reduce_sum(const v_uint64x2& a) { @@ -1778,13 +1773,13 @@ inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) { @@ -1813,15 +1808,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a) inline v_uint16x8 v_popcount(const v_uint16x8& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index e66563bede..fbe690461a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -513,48 +513,44 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) /* Element-wise binary and unary operations */ /** Arithmetics **/ #define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(intrin(a.val, b.val)); } \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ a.val = intrin(a.val, b.val); return a; } +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(intrin(a.val, b.val)); } -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, 
v_float32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint8x16, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint8x16, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int8x16, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int8x16, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint16x8, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint16x8, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int16x8, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int16x8, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_uint32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_int32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float32x4, vec_div) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float64x2, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float64x2, vec_div) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int64x2, vec_sub) // saturating multiply #define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8) @@ -596,9 +592,9 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul) /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \ -inline _Tpvec operator << (const _Tpvec& a, int imm) \ +inline _Tpvec v_shl(const _Tpvec& a, int imm) \ { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int imm) \ +inline _Tpvec v_shr(const _Tpvec& a, int imm) \ { return _Tpvec(shr(a.val, splfunc(imm))); } \ template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \ @@ -617,10 +613,10 @@ OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_and, _Tpvec, vec_and) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_or, _Tpvec, vec_or) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_xor, _Tpvec, vec_xor) \ +inline _Tpvec v_not(const _Tpvec& a) \ { return
_Tpvec(vec_not(a.val)); } OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16) @@ -650,17 +646,17 @@ OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c) /** Comparison **/ #define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpeq(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpne(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmplt(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpgt(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmple(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpge(a.val, b.val)); } OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16) @@ -1060,7 +1056,7 @@ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4) OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2) inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) -{ return a * b + c; } +{ return v_add(v_mul(a, b), c); } // TODO: exp, log, sin, cos @@ -1089,12 +1085,12 @@ inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) -{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); } +{ return v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b))); } inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Absolute difference for signed integers **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) @@ -1442,7 +1438,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) return v_int64x2(vec_add(even, odd)); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) @@ -1485,7 +1481,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) return v_uint64x2(vec_add(s0, s1)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -1495,13 +1491,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val))); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const
v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1531,7 +1527,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z))); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) @@ -1544,10 +1540,10 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) v_int32x4 prod = v_dotprod(a, b); v_int64x2 c, d; v_expand(prod, c, d); - return c + d; + return v_add(c, d); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp index 5d470d9419..3a8069ca91 100644 --- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp @@ -849,53 +849,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ } -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul) 
-OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint8x16, wasm_u8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint8x16, wasm_u8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int8x16, wasm_i8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int8x16, wasm_i8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint16x8, wasm_u16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint16x8, wasm_u16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int16x8, wasm_i16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int16x8, wasm_i16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_uint32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_int32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float32x4, wasm_f32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float32x4, wasm_f32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float32x4, wasm_f32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float32x4, wasm_f32x4_div) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float64x2, wasm_f64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float64x2, wasm_f64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float64x2, wasm_f64x2_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float64x2, wasm_f64x2_div) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \ -inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ -} \ -inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ -{ a = a * b; return a; } +} OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8) @@ -986,7 +979,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { @@ -1000,7 +993,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { - return v_dotprod(a, b) + c; + return v_add(v_dotprod(a, b), c); } // 8 >> 32 @@ -1010,13 +1003,13 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) v128_t a1 = wasm_u16x8_shr(a.val, 8); v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8); v128_t b1 = wasm_u16x8_shr(b.val, 8); - return v_uint32x4(( - v_dotprod(v_int16x8(a0), v_int16x8(b0)) + - v_dotprod(v_int16x8(a1), v_int16x8(b1))).val + return v_uint32x4((v_add( + v_dotprod(v_int16x8(a0), v_int16x8(b0)), + v_dotprod(v_int16x8(a1), v_int16x8(b1)))).val ); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const 
v_int8x16& a, const v_int8x16& b) { @@ -1024,13 +1017,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) v128_t a1 = wasm_i16x8_shr(a.val, 8); v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8); v128_t b1 = wasm_i16x8_shr(b.val, 8); - return v_int32x4( - v_dotprod(v_int16x8(a0), v_int16x8(b0)) + + return v_int32x4(v_add( + v_dotprod(v_int16x8(a0), v_int16x8(b0)), v_dotprod(v_int16x8(a1), v_int16x8(b1)) - ); + )); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) @@ -1039,13 +1032,13 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) v128_t a1 = wasm_u32x4_shr(a.val, 16); v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16); v128_t b1 = wasm_u32x4_shr(b.val, 16); - return v_uint64x2(( - v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + return v_uint64x2((v_add( + v_dotprod(v_int32x4(a0), v_int32x4(b0)), v_dotprod(v_int32x4(a1), v_int32x4(b1))).val - ); + )); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -1053,20 +1046,20 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) v128_t a1 = wasm_i32x4_shr(a.val, 16); v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16); v128_t b1 = wasm_i32x4_shr(b.val, 16); - return v_int64x2(( - v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + return v_int64x2((v_add( + v_dotprod(v_int32x4(a0), v_int32x4(b0)), v_dotprod(v_int32x4(a1), v_int32x4(b1))) - ); + )); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1109,10 +1102,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, { return v_dotprod_expand(a, b, c); } #define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_and, _Tpvec, wasm_v128_and) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_or, _Tpvec, wasm_v128_or) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_xor, _Tpvec, wasm_v128_xor) \ +inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(wasm_v128_not(a.val)); \ } @@ -1215,17 +1208,17 @@ OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000) OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000) #define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& 
a, const _Tpvec& b) \ { return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); } OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16) @@ -1238,10 +1231,10 @@ OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4) OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2) #define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); } +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_eq(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); } \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_ne(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); } OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64) OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64) @@ -1299,17 +1292,17 @@ OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul) /** Absolute difference **/ inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { v_int8x16 d = v_sub_wrap(a, b); - v_int8x16 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x16 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { @@ -1317,25 +1310,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - v_int32x4 d = a - b; - v_int32x4 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x4 d = v_sub(a, b); + v_int32x4 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } /** Saturating absolute difference **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) { - v_int8x16 d = a - b; - v_int8x16 m = a < b; - return (d ^ m) - m; + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } 
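The signed v_absdiff / v_absdiffs rewrites above keep the intrinsics' branch-free conditional-negation idiom, now spelled with the named wrappers (v_sub, v_lt, v_xor). A minimal scalar sketch of the same trick, with a hypothetical helper name that is not part of the patch:

    #include <cstdint>

    // m is all-ones exactly when a < b (the lane pattern v_lt produces);
    // (d ^ m) - m then negates d, since ~d + 1 == -d in two's complement.
    static inline uint32_t absdiff_scalar(int32_t a, int32_t b)
    {
        uint32_t d = (uint32_t)a - (uint32_t)b; // wraps, like the vector subtraction
        uint32_t m = a < b ? 0xFFFFFFFFu : 0u;  // scalar stand-in for the v_lt mask
        return (d ^ m) - m;                     // |a - b| without a branch
    }

The vector versions reinterpret the result as unsigned (v_reinterpret_as_u8/u32), which is why the wrap-around arithmetic is harmless.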
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) @@ -1345,12 +1338,12 @@ inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) @@ -1386,19 +1379,19 @@ OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4) OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2) #define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \ -inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \ } \ -inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \ } \ -inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \ } \ -inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \ } \ @@ -1694,7 +1687,7 @@ inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) v_expand(v_absdiff(a, b), l16, h16); v_expand(l16, l16_l32, l16_h32); v_expand(h16, h16_l32, h16_h32); - return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); + return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32))); } inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) { @@ -1703,19 +1696,19 @@ inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) v_expand(v_absdiff(a, b), l16, h16); v_expand(l16, l16_l32, l16_h32); v_expand(h16, h16_l32, h16_h32); - return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); + return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32))); } inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) { @@ -1744,15 +1737,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a) inline v_uint16x8 v_popcount(const v_uint16x8& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index 0452d46e83..0d115d6595 100644 ---
a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -1801,7 +1801,7 @@ INSTANTIATE_TEST_CASE_P(Core_CartToPolarToCart, ElemWiseTest, ::testing::Values( // Mixed Type Arithmetic Operations -typedef std::tuple<ElemWiseOpPtr, std::tuple<int, int>> SomeType; +typedef std::tuple<ElemWiseOpPtr, std::tuple<int, int>, int> SomeType; class ArithmMixedTest : public ::testing::TestWithParam<SomeType> {}; TEST_P(ArithmMixedTest, accuracy) @@ -1810,7 +1810,10 @@ TEST_P(ArithmMixedTest, accuracy) ElemWiseOpPtr op = std::get<0>(p); int srcDepth = std::get<0>(std::get<1>(p)); int dstDepth = std::get<1>(std::get<1>(p)); + int channels = std::get<2>(p); + int srcType = CV_MAKETYPE(srcDepth, channels); + int dstType = CV_MAKETYPE(dstDepth, channels); op->flags |= BaseElemWiseOp::MIXED_TYPE; int testIdx = 0; RNG rng((uint64)ARITHM_RNG_SEED); @@ -1825,15 +1828,15 @@ TEST_P(ArithmMixedTest, accuracy) int ninputs = op->ninputs; vector<Mat> src(ninputs); for(int i = 0; i < ninputs; i++ ) - src[i] = cvtest::randomMat(rng, size, srcDepth, minval, maxval, true); + src[i] = cvtest::randomMat(rng, size, srcType, minval, maxval, true); Mat dst0, dst, mask; if( haveMask ) { mask = cvtest::randomMat(rng, size, CV_8UC1, 0, 2, true); } - dst0 = cvtest::randomMat(rng, size, dstDepth, minval, maxval, false); - dst = cvtest::randomMat(rng, size, dstDepth, minval, maxval, true); + dst0 = cvtest::randomMat(rng, size, dstType, minval, maxval, false); + dst = cvtest::randomMat(rng, size, dstType, minval, maxval, true); cvtest::copy(dst, dst0); op->generateScalars(dstDepth, rng); @@ -1853,53 +1856,62 @@ INSTANTIATE_TEST_CASE_P(Core_AddMixed, ArithmMixedTest, ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_AddScalarMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new AddSOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_AddWeightedMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new AddWeightedOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_SubMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new SubOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_SubScalarMinusArgMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new SubRSOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_MulMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new MulOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_MulScalarMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new MulSOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U},
std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_DivMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new DivOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_RecipMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new RecipOp)), - ::testing::Values(std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); TEST(Core_ArithmMask, uninitialized) { diff --git a/modules/dnn/src/tflite/tflite_importer.cpp index 92bfeeef65..7e7f1d0503 100644 --- a/modules/dnn/src/tflite/tflite_importer.cpp +++ b/modules/dnn/src/tflite/tflite_importer.cpp @@ -271,7 +271,7 @@ TFLiteImporter::DispatchMap TFLiteImporter::buildDispatchMap() dispatch["DEPTHWISE_CONV_2D"] = &TFLiteImporter::parseDWConvolution; dispatch["ADD"] = dispatch["MUL"] = &TFLiteImporter::parseEltwise; dispatch["RELU"] = dispatch["PRELU"] = dispatch["HARD_SWISH"] = - dispatch["LOGISTIC"] = &TFLiteImporter::parseActivation; + dispatch["LOGISTIC"] = dispatch["LEAKY_RELU"] = &TFLiteImporter::parseActivation; dispatch["MAX_POOL_2D"] = dispatch["AVERAGE_POOL_2D"] = &TFLiteImporter::parsePooling; dispatch["MaxPoolingWithArgmax2D"] = &TFLiteImporter::parsePoolingWithArgmax; dispatch["MaxUnpooling2D"] = &TFLiteImporter::parseUnpooling; @@ -1029,6 +1029,7 @@ void TFLiteImporter::parseActivation(const Operator& op, const std::string& opco } void TFLiteImporter::parseActivation(const Operator& op, const std::string& opcode, LayerParams& activParams, bool isFused) { + float slope = 0.; if (opcode == "NONE") return; else if (opcode == "RELU6") @@ -1041,6 +1042,13 @@ void TFLiteImporter::parseActivation(const Operator& op, const std::string& opco activParams.type = "HardSwish"; else if (opcode == "LOGISTIC") activParams.type = "Sigmoid"; + else if (opcode == "LEAKY_RELU") + { + activParams.type = "ReLU"; + auto options = reinterpret_cast<const LeakyReluOptions*>(op.builtin_options()); + slope = options->alpha(); + activParams.set("negative_slope", slope); + } else CV_Error(Error::StsNotImplemented, "Unsupported activation " + opcode); @@ -1072,6 +1080,8 @@ void TFLiteImporter::parseActivation(const Operator& op, const std::string& opco y = 1.0f / (1.0f + std::exp(-x)); else if (opcode == "HARD_SWISH") y = x * max(0.f, min(1.f, x / 6.f + 0.5f)); + else if (opcode == "LEAKY_RELU") + y = x >= 0.f ? x : slope*x; else CV_Error(Error::StsNotImplemented, "Lookup table for " + opcode);
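For reference, the LEAKY_RELU change above reuses OpenCV's existing ReLU layer and only forwards the slope. A minimal sketch of the LayerParams the importer ends up producing (the slope value passed in is an illustrative assumption; in the real path it comes from LeakyReluOptions::alpha()):

// Sketch: the ReLU-with-negative_slope parameters that LEAKY_RELU maps to.
#include <opencv2/dnn.hpp>

cv::dnn::LayerParams makeLeakyReluParams(float slope /* e.g. 0.1f, illustrative */)
{
    cv::dnn::LayerParams lp;
    lp.type = "ReLU";                 // LEAKY_RELU maps onto the plain ReLU layer
    lp.set("negative_slope", slope);  // y = x >= 0 ? x : slope * x
    return lp;
}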
diff --git a/modules/dnn/test/test_tflite_importer.cpp index 31e30ea724..6f21fe34ef 100644 --- a/modules/dnn/test/test_tflite_importer.cppp +++ b/modules/dnn/test/test_tflite_importer.cpp @@ -271,6 +271,10 @@ TEST_P(Test_TFLite, global_max_pooling_2d) { testLayer("global_max_pooling_2d"); } +TEST_P(Test_TFLite, leakyRelu) { + testLayer("leakyRelu"); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_TFLite, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/features2d/src/fast.avx2.cpp index 72e7d66924..3d408a03df 100644 --- a/modules/features2d/src/fast.avx2.cpp +++ b/modules/features2d/src/fast.avx2.cpp @@ -157,7 +157,7 @@ public: q0 = v_max(q0, v_min(a, v0_)); q1 = v_min(q1, v_max(b, v0_)); } - q0 = v_max(q0, v_setzero_s16() - q1); + q0 = v_max(q0, v_sub(v_setzero_s16(), q1)); curr[j + k] = (uchar)(v_reduce_max(q0) - 1); } } diff --git a/modules/features2d/src/sift.simd.hpp index 2c5cf9f997..76ef3082ea 100644 --- a/modules/features2d/src/sift.simd.hpp +++ b/modules/features2d/src/sift.simd.hpp @@ -150,7 +150,7 @@ void findScaleSpaceExtrema( void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float scl, - int d, int n, Mat& dst, int row + const int d, const int n, Mat& dst, int row ); @@ -708,7 +708,7 @@ void findScaleSpaceExtrema( void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float scl, - int d, int n, Mat& dstMat, int row + const int d, const int n, Mat& dstMat, int row ) { CV_TRACE_FUNCTION(); @@ -725,7 +725,10 @@ void calcSIFTDescriptor( cos_t /= hist_width; sin_t /= hist_width; - int i, j, k, len = (radius*2+1)*(radius*2+1), histlen = (d+2)*(d+2)*(n+2); + int i, j, k; + const int len = (radius*2+1)*(radius*2+1); + const int len_hist = (d+2)*(d+2)*(n+2); + const int len_ddn = d * d * n; int rows = img.rows, cols = img.cols; cv::utils::BufferArea area; @@ -736,8 +739,8 @@ void calcSIFTDescriptor( area.allocate(W, len, CV_SIMD_WIDTH); area.allocate(RBin, len, CV_SIMD_WIDTH); area.allocate(CBin, len, CV_SIMD_WIDTH); - area.allocate(hist, histlen, CV_SIMD_WIDTH); - area.allocate(rawDst, len, CV_SIMD_WIDTH); + area.allocate(hist, len_hist, CV_SIMD_WIDTH); + area.allocate(rawDst, len_ddn, CV_SIMD_WIDTH); area.commit(); Mag = Y; @@ -771,10 +774,10 @@ void calcSIFTDescriptor( } } - len = k; - cv::hal::fastAtan2(Y, X, Ori, len, true); - cv::hal::magnitude32f(X, Y, Mag, len); - cv::hal::exp32f(W, W, len); + const int len_left = k; + cv::hal::fastAtan2(Y, X, Ori, len_left, true); + cv::hal::magnitude32f(X, Y, Mag, len_left); + cv::hal::exp32f(W, W, len_left); k = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) @@ -788,7 +791,7 @@ void calcSIFTDescriptor( const v_int32 __1 = vx_setall_s32(1); const v_int32 __d_plus_2 = vx_setall_s32(d+2); const v_int32 __n_plus_2 = vx_setall_s32(n+2); - for( ; k <= len - vecsize; k += vecsize ) + for( ; k <= len_left - vecsize; k += vecsize ) { v_float32 rbin = vx_load_aligned(RBin + k); v_float32 cbin = vx_load_aligned(CBin + k); @@ -839,7 +842,7 @@ void calcSIFTDescriptor( } } #endif - for( ; k < len; k++ ) + for( ; k < len_left; k++ ) { float rbin = RBin[k], cbin = CBin[k]; float obin = (Ori[k] - ori)*bins_per_rad; @@ -892,13 +895,12 @@ void calcSIFTDescriptor( // and scale the result, so that it can be easily converted // to byte array float nrm2 = 0; - len = d*d*n; k = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) { v_float32 __nrm2 =
vx_setzero_f32(); v_float32 __rawDst; - for( ; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() ) + for( ; k <= len_ddn - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() ) { __rawDst = vx_load_aligned(rawDst + k); __nrm2 = v_fma(__rawDst, __rawDst, __nrm2); @@ -906,10 +908,10 @@ void calcSIFTDescriptor( nrm2 = (float)v_reduce_sum(__nrm2); } #endif - for( ; k < len; k++ ) + for( ; k < len_ddn; k++ ) nrm2 += rawDst[k]*rawDst[k]; - float thr = std::sqrt(nrm2)*SIFT_DESCR_MAG_THR; + const float thr = std::sqrt(nrm2)*SIFT_DESCR_MAG_THR; i = 0, nrm2 = 0; #if 0 //CV_AVX2 @@ -920,7 +922,7 @@ void calcSIFTDescriptor( __m256 __dst; __m256 __nrm2 = _mm256_setzero_ps(); __m256 __thr = _mm256_set1_ps(thr); - for( ; i <= len - 8; i += 8 ) + for( ; i <= len_ddn - 8; i += 8 ) { __dst = _mm256_loadu_ps(&rawDst[i]); __dst = _mm256_min_ps(__dst, __thr); @@ -936,7 +938,7 @@ void calcSIFTDescriptor( nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7]; } #endif - for( ; i < len; i++ ) + for( ; i < len_ddn; i++ ) { float val = std::min(rawDst[i], thr); rawDst[i] = val; @@ -954,7 +956,7 @@ if( dstMat.type() == CV_32F ) v_float32 __min = vx_setzero_f32(); v_float32 __max = vx_setall_f32(255.0f); // max of uchar v_float32 __nrm2 = vx_setall_f32(nrm2); - for( k = 0; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() ) + for( k = 0; k <= len_ddn - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() ) { __dst = vx_load_aligned(rawDst + k); __dst = v_min(v_max(v_cvt_f32(v_round(v_mul(__dst, __nrm2))), __min), __max); @@ -965,7 +967,7 @@ if( dstMat.type() == CV_32F ) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" // iteration XX invokes undefined behavior #endif - for( ; k < len; k++ ) + for( ; k < len_ddn; k++ ) { dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2); } @@ -980,7 +982,7 @@ else // CV_8U v_float32 __dst0, __dst1; v_uint16 __pack01; v_float32 __nrm2 = vx_setall_f32(nrm2); - for( k = 0; k <= len - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 ) + for( k = 0; k <= len_ddn - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 ) { __dst0 = vx_load_aligned(rawDst + k); __dst1 = vx_load_aligned(rawDst + k + VTraits<v_float32>::vlanes()); @@ -994,7 +996,7 @@ else // CV_8U #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" // iteration XX invokes undefined behavior #endif - for( ; k < len; k++ ) + for( ; k < len_ddn; k++ ) { dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2); } @@ -1004,7 +1006,7 @@ else // CV_8U } #else float nrm1 = 0; - for( k = 0; k < len; k++ ) + for( k = 0; k < len_ddn; k++ ) { rawDst[k] *= nrm2; nrm1 += rawDst[k]; @@ -1013,7 +1015,7 @@ else // CV_8U if( dstMat.type() == CV_32F ) { float *dst = dstMat.ptr<float>(row); - for( k = 0; k < len; k++ ) + for( k = 0; k < len_ddn; k++ ) { dst[k] = std::sqrt(rawDst[k] * nrm1); } @@ -1021,7 +1023,7 @@ else // CV_8U else // CV_8U { uint8_t *dst = dstMat.ptr<uint8_t>(row); - for( k = 0; k < len; k++ ) + for( k = 0; k < len_ddn; k++ ) { dst[k] = saturate_cast<uchar>(std::sqrt(rawDst[k] * nrm1)*SIFT_INT_DESCR_FCTR); } }
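The renames above make the three different lengths explicit: len is the number of samples in the orientation window (it depends on the keypoint radius), len_hist is the interpolation-histogram size, and len_ddn is the final descriptor size. rawDst is now allocated with len_ddn instead of len, so the descriptor loops can no longer disagree with the buffer size. For the default SIFT layout the values work out as follows (an illustrative check only):

// Illustrative only: sizes for the default SIFT descriptor layout d = 4, n = 8.
#include <cstdio>

int main()
{
    const int d = 4, n = 8;                            // SIFT_DESCR_WIDTH, SIFT_DESCR_HIST_BINS
    const int len_hist = (d + 2) * (d + 2) * (n + 2);  // 360 floats for the interpolation histogram
    const int len_ddn  = d * d * n;                    // 128 floats for the final descriptor
    std::printf("len_hist=%d len_ddn=%d\n", len_hist, len_ddn);
    return 0;
}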
diff --git a/modules/features2d/test/test_sift.cpp index 731b31ac0f..d98f1c6b8a 100644 --- a/modules/features2d/test/test_sift.cpp +++ b/modules/features2d/test/test_sift.cpp @@ -30,5 +30,17 @@ TEST(Features2d_SIFT, descriptor_type) ASSERT_EQ(countNonZero(diff), 0) << "descriptors are not identical"; } +TEST(Features2d_SIFT, regression_26139) +{ + auto extractor = cv::SIFT::create(); + cv::Mat1b image{cv::Size{300, 300}, 0}; + std::vector<cv::KeyPoint> kps { + cv::KeyPoint(154.076813f, 136.160904f, 111.078636f, 216.195618f, 0.00000899323549f, 7) + }; + cv::Mat descriptors; + extractor->compute(image, kps, descriptors); // we expect no memory corruption + ASSERT_EQ(descriptors.size(), Size(128, 1)); +} + }} // namespace diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp index 77f091d55a..9063aafe2c 100644 --- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp +++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp @@ -222,9 +222,9 @@ enum ImwriteHDRCompressionFlags { @anchor imread -The function imread loads an image from the specified file and returns it. If the image cannot be -read (because of missing file, improper permissions, unsupported or invalid format), the function -returns an empty matrix ( Mat::data==NULL ). +The `imread` function loads an image from the specified file and returns an OpenCV matrix. If the image cannot be +read (because of a missing file, improper permissions, or unsupported/invalid format), the function +returns an empty matrix. Currently, the following file formats are supported: @@ -234,7 +234,7 @@ Currently, the following file formats are supported: - Portable Network Graphics - \*.png (see the *Note* section) - WebP - \*.webp (see the *Note* section) - AVIF - \*.avif (see the *Note* section) -- Portable image format - \*.pbm, \*.pgm, \*.ppm \*.pxm, \*.pnm (always supported) +- Portable image format - \*.pbm, \*.pgm, \*.ppm, \*.pxm, \*.pnm (always supported) - PFM files - \*.pfm (see the *Note* section) - Sun rasters - \*.sr, \*.ras (always supported) - TIFF files - \*.tiff, \*.tif (see the *Note* section) @@ -243,32 +243,31 @@ Currently, the following file formats are supported: - Raster and Vector geospatial data supported by GDAL (see the *Note* section) @note -- The function determines the type of an image by the content, not by the file extension. +- The function determines the type of an image by its content, not by the file extension. - In the case of color images, the decoded images will have the channels stored in **B G R** order. - When using IMREAD_GRAYSCALE, the codec's internal grayscale conversion will be used, if available. - Results may differ to the output of cvtColor() -- On Microsoft Windows\* OS and MacOSX\*, the codecs shipped with an OpenCV image (libjpeg, - libpng, libtiff, and libjasper) are used by default. So, OpenCV can always read JPEGs, PNGs, - and TIFFs. On MacOSX, there is also an option to use native MacOSX image readers. But beware - that currently these native image loaders give images with different pixel values because of - the color management embedded into MacOSX. -- On Linux\*, BSD flavors and other Unix-like open-source operating systems, OpenCV looks for - codecs supplied with an OS image. Install the relevant packages (do not forget the development - files, for example, "libjpeg-dev", in Debian\* and Ubuntu\*) to get the codec support or turn + Results may differ from the output of cvtColor(). +- On Microsoft Windows\* and Mac OS\*, the codecs shipped with OpenCV (libjpeg, libpng, libtiff, + and libjasper) are used by default. So, OpenCV can always read JPEGs, PNGs, and TIFFs. On Mac OS, + there is also an option to use native Mac OS image readers. However, beware that currently these + native image loaders give images with different pixel values because of the color management embedded + into Mac OS. +- On Linux\*, BSD flavors, and other Unix-like open-source operating systems, OpenCV looks for + codecs supplied with the OS.
Ensure the relevant packages are installed (including development + files, such as "libjpeg-dev" in Debian\* and Ubuntu\*) to get codec support, or turn on the OPENCV_BUILD_3RDPARTY_LIBS flag in CMake. -- In the case you set *WITH_GDAL* flag to true in CMake and @ref IMREAD_LOAD_GDAL to load the image, - then the [GDAL](http://www.gdal.org) driver will be used in order to decode the image, supporting - the following formats: [Raster](http://www.gdal.org/formats_list.html), - [Vector](http://www.gdal.org/ogr_formats.html). -- If EXIF information is embedded in the image file, the EXIF orientation will be taken into account - and thus the image will be rotated accordingly except if the flags @ref IMREAD_IGNORE_ORIENTATION +- If the *WITH_GDAL* flag is set to true in CMake and @ref IMREAD_LOAD_GDAL is used to load the image, + the [GDAL](http://www.gdal.org) driver will be used to decode the image, supporting + [Raster](http://www.gdal.org/formats_list.html) and [Vector](http://www.gdal.org/ogr_formats.html) formats. +- If EXIF information is embedded in the image file, the EXIF orientation will be taken into account, + and thus the image will be rotated accordingly unless the flags @ref IMREAD_IGNORE_ORIENTATION or @ref IMREAD_UNCHANGED are passed. -- Use the IMREAD_UNCHANGED flag to keep the floating point values from PFM image. -- By default number of pixels must be less than 2^30. Limit can be set using system - variable OPENCV_IO_MAX_IMAGE_PIXELS +- Use the IMREAD_UNCHANGED flag to preserve the floating-point values from PFM images. +- By default, the number of pixels must be less than 2^30. This limit can be changed by setting + the environment variable `OPENCV_IO_MAX_IMAGE_PIXELS`. See @ref tutorial_env_reference. -@param filename Name of file to be loaded. -@param flags Flag that can take values of cv::ImreadModes +@param filename Name of the file to be loaded. +@param flags Flag that can take values of `cv::ImreadModes`. */ CV_EXPORTS_W Mat imread( const String& filename, int flags = IMREAD_COLOR_BGR );
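The documented contract, an empty matrix on failure, is typically checked like this (the file name is an illustrative placeholder):

// Minimal usage sketch for the documented imread behavior.
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    cv::Mat img = cv::imread("input.png", cv::IMREAD_COLOR_BGR);  // "input.png" is a placeholder
    if (img.empty())  // empty matrix signals a missing, unreadable, or unsupported file
    {
        std::cerr << "failed to load image" << std::endl;
        return 1;
    }
    std::cout << img.cols << "x" << img.rows << std::endl;
    return 0;
}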
diff --git a/modules/imgcodecs/src/bitstrm.cpp index a8f91aa4dd..bb92d8a73b 100644 --- a/modules/imgcodecs/src/bitstrm.cpp +++ b/modules/imgcodecs/src/bitstrm.cpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html #include "precomp.hpp" #include "bitstrm.hpp" @@ -49,11 +11,6 @@ namespace cv const int BS_DEF_BLOCK_SIZE = 1<<15; -bool bsIsBigEndian( void ) -{ - return (((const int*)"\0\x1\x2\x3\x4\x5\x6\x7")[0] & 255) != 0; -} - ///////////////////////// RBaseStream //////////////////////////// bool RBaseStream::isOpened() diff --git a/modules/imgcodecs/src/bitstrm.hpp index dd78d5d3d6..391ade503d 100644 --- a/modules/imgcodecs/src/bitstrm.hpp +++ b/modules/imgcodecs/src/bitstrm.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html #ifndef _BITSTRM_H_ #define _BITSTRM_H_ @@ -183,13 +145,6 @@ public: bool putDWord( int val ); }; -inline unsigned BSWAP(unsigned v) -{ - return (v<<24)|((v&0xff00)<<8)|((v>>8)&0xff00)|((unsigned)v>>24); -} - -bool bsIsBigEndian( void ); - } #endif/*_BITSTRM_H_*/ diff --git a/modules/imgcodecs/src/grfmt_base.hpp b/modules/imgcodecs/src/grfmt_base.hpp index 9ae23b24df..f6b5ba1b27 100644 --- a/modules/imgcodecs/src/grfmt_base.hpp +++ b/modules/imgcodecs/src/grfmt_base.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html #ifndef _GRFMT_BASE_H_ #define _GRFMT_BASE_H_ @@ -55,75 +17,228 @@ class BaseImageEncoder; typedef Ptr<BaseImageEncoder> ImageEncoder; typedef Ptr<BaseImageDecoder> ImageDecoder; -///////////////////////////////// base class for decoders //////////////////////// -class BaseImageDecoder -{ +/** + * @brief Base class for image decoders. + * + * The BaseImageDecoder class provides an abstract interface for decoding various image formats. + * It defines common functionality like setting the image source, reading image headers, + * and handling EXIF metadata. Derived classes must implement methods for reading image headers + * and image data to handle format-specific decoding logic. + */ + +class BaseImageDecoder { public: + /** + * @brief Constructor for BaseImageDecoder. + * Initializes the object and sets default values for member variables. + */ BaseImageDecoder(); + + /** + * @brief Virtual destructor for BaseImageDecoder. + * Ensures proper cleanup of derived classes when deleted via a pointer to BaseImageDecoder. + */ virtual ~BaseImageDecoder() {} + /** + * @brief Get the width of the image. + * @return The width of the image (in pixels). + */ int width() const { return m_width; } + + /** + * @brief Get the height of the image. + * @return The height of the image (in pixels). + */ int height() const { return m_height; } + + /** + * @brief Get the number of frames in the image or animation. + * @return The number of frames in the image. + */ size_t getFrameCount() const { return m_frame_count; } + + /** + * @brief Get the type of the image (e.g., color format, depth). + * @return The type of the image. + */ virtual int type() const { return m_type; } + /** + * @brief Fetch a specific EXIF tag from the image's metadata. + * @param tag The EXIF tag to retrieve. + * @return The EXIF entry corresponding to the tag. + */ ExifEntry_t getExifTag(const ExifTagName tag) const; - virtual bool setSource( const String& filename ); - virtual bool setSource( const Mat& buf ); - virtual int setScale( const int& scale_denom ); - virtual bool readHeader() = 0; - virtual bool readData( Mat& img ) = 0; + /** + * @brief Set the image source from a file. + * @param filename The name of the file to load the image from. + * @return true if the source was successfully set, false otherwise. + */ + virtual bool setSource(const String& filename); + + /** + * @brief Set the image source from a memory buffer. + * @param buf The buffer containing the image data. + * @return true if the source was successfully set, false otherwise. + */ + virtual bool setSource(const Mat& buf); + + /** + * @brief Set the scale factor for the image. + * @param scale_denom The denominator of the scale factor (image is scaled down by 1/scale_denom). + * @return The scale factor that was set. + */ + virtual int setScale(const int& scale_denom); + + /** + * @brief Read the image header to extract basic properties (width, height, type). + * This is a pure virtual function that must be implemented by derived classes. + * @return true if the header was successfully read, false otherwise. + */ + virtual bool readHeader() = 0; + + /** + * @brief Read the image data into a Mat object. + * This is a pure virtual function that must be implemented by derived classes. + * @param img The Mat object where the image data will be stored. + * @return true if the data was successfully read, false otherwise.
+ */ + virtual bool readData(Mat& img) = 0; + + /** + * @brief Set whether to decode the image in RGB order instead of the default BGR. + * @param useRGB If true, the image will be decoded in RGB order. + */ virtual void setRGB(bool useRGB); - /// Called after readData to advance to the next page, if any. + /** + * @brief Advance to the next page or frame of the image, if applicable. + * The default implementation does nothing and returns false. + * @return true if there is another page/frame, false otherwise. + */ virtual bool nextPage() { return false; } + /** + * @brief Get the length of the format signature used to identify the image format. + * @return The length of the signature. + */ virtual size_t signatureLength() const; - virtual bool checkSignature( const String& signature ) const; + + /** + * @brief Check if the provided signature matches the expected format signature. + * @param signature The signature to check. + * @return true if the signature matches, false otherwise. + */ + virtual bool checkSignature(const String& signature) const; + + /** + * @brief Create and return a new instance of the derived image decoder. + * @return A new ImageDecoder object. + */ virtual ImageDecoder newDecoder() const; protected: - int m_width; // width of the image ( filled by readHeader ) - int m_height; // height of the image ( filled by readHeader ) - int m_type; - int m_scale_denom; - String m_filename; - String m_signature; - Mat m_buf; - bool m_buf_supported; - bool m_use_rgb; // flag of decode image as RGB order instead of BGR. - ExifReader m_exif; - size_t m_frame_count; + int m_width; ///< Width of the image (set by readHeader). + int m_height; ///< Height of the image (set by readHeader). + int m_type; ///< Image type (e.g., color depth, channel order). + int m_scale_denom; ///< Scale factor denominator for resizing the image. + String m_filename; ///< Name of the file that is being decoded. + String m_signature; ///< Signature for identifying the image format. + Mat m_buf; ///< Buffer holding the image data when loaded from memory. + bool m_buf_supported; ///< Flag indicating whether buffer-based loading is supported. + bool m_use_rgb; ///< Flag indicating whether to decode the image in RGB order. + ExifReader m_exif; ///< Object for reading EXIF metadata from the image. + size_t m_frame_count; ///< Number of frames in the image (for animations and multi-page images). }; -///////////////////////////// base class for encoders //////////////////////////// -class BaseImageEncoder -{ +/** + * @brief Base class for image encoders. + * + * The BaseImageEncoder class provides an abstract interface for encoding images in various formats. + * It defines common functionality like setting the destination (file or memory buffer), checking if + * the format supports a specific image depth, and writing image data. Derived classes must implement + * methods like writing the image data to handle format-specific encoding logic. + */ +class BaseImageEncoder { public: + /** + * @brief Constructor for BaseImageEncoder. + * Initializes the object and sets default values for member variables. + */ BaseImageEncoder(); - virtual ~BaseImageEncoder() {} - virtual bool isFormatSupported( int depth ) const; - virtual bool setDestination( const String& filename ); - virtual bool setDestination( std::vector<uchar>& buf ); - virtual bool write( const Mat& img, const std::vector<int>& params ) = 0; + /** + * @brief Virtual destructor for BaseImageEncoder.
+ * Ensures proper cleanup of derived classes when deleted via a pointer to BaseImageEncoder. + */ + virtual ~BaseImageEncoder() {} + + /** + * @brief Checks if the image format supports a specific image depth. + * @param depth The depth (bit depth) of the image. + * @return true if the format supports the specified depth, false otherwise. + */ + virtual bool isFormatSupported(int depth) const; + + /** + * @brief Set the destination for encoding as a file. + * @param filename The name of the file to which the image will be written. + * @return true if the destination was successfully set, false otherwise. + */ + virtual bool setDestination(const String& filename); + + /** + * @brief Set the destination for encoding as a memory buffer. + * @param buf A reference to the buffer where the encoded image data will be stored. + * @return true if the destination was successfully set, false otherwise. + */ + virtual bool setDestination(std::vector<uchar>& buf); + + /** + * @brief Encode and write the image data. + * This is a pure virtual function that must be implemented by derived classes. + * @param img The Mat object containing the image data to be encoded. + * @param params A vector of parameters controlling the encoding process (e.g., compression level). + * @return true if the image was successfully written, false otherwise. + */ + virtual bool write(const Mat& img, const std::vector<int>& params) = 0; + + /** + * @brief Encode and write multiple images (e.g., for animated formats). + * By default, this method returns false, indicating that the format does not support multi-image encoding. + * @param img_vec A vector of Mat objects containing the images to be encoded. + * @param params A vector of parameters controlling the encoding process. + * @return true if multiple images were successfully written, false otherwise. + */ virtual bool writemulti(const std::vector<Mat>& img_vec, const std::vector<int>& params); + /** + * @brief Get a description of the image encoder (e.g., the format it supports). + * @return A string describing the encoder. + */ virtual String getDescription() const; + + /** + * @brief Create and return a new instance of the derived image encoder. + * @return A new ImageEncoder object. + */ virtual ImageEncoder newEncoder() const; + /** + * @brief Throw an exception based on the last error encountered during encoding. + * This method can be used to propagate error conditions back to the caller. + */ virtual void throwOnEror() const; protected: - String m_description; - - String m_filename; - std::vector<uchar>* m_buf; - bool m_buf_supported; - - String m_last_error; + String m_description; ///< Description of the encoder (e.g., format name, capabilities). + String m_filename; ///< Destination file name for encoded data. + std::vector<uchar>* m_buf; ///< Pointer to the buffer for encoded data if using memory-based destination. + bool m_buf_supported; ///< Flag indicating whether buffer-based encoding is supported. + String m_last_error; ///< Stores the last error message encountered during encoding. }; }
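A minimal sketch of a decoder written against this interface (the format, its fixed 16x16 size, and the class name are purely illustrative; a real decoder parses its header from m_strm or m_buf and must also be registered with the codec list):

// Illustrative skeleton of a BaseImageDecoder subclass for a hypothetical format.
class MyFormatDecoder CV_FINAL : public BaseImageDecoder
{
public:
    MyFormatDecoder()
    {
        m_signature = "MYFT";     // hypothetical 4-byte magic
        m_buf_supported = true;   // allow decoding from memory buffers
    }

    bool readHeader() CV_OVERRIDE
    {
        m_width = 16; m_height = 16; m_type = CV_8UC1;  // assumed fixed-size header
        return true;
    }

    bool readData(Mat& img) CV_OVERRIDE
    {
        img.create(m_height, m_width, m_type);
        img.setTo(Scalar::all(0));  // stand-in for real pixel decoding
        return true;
    }

    ImageDecoder newDecoder() const CV_OVERRIDE { return makePtr<MyFormatDecoder>(); }
};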
diff --git a/modules/imgcodecs/src/grfmt_pfm.cpp index 61cab06714..baa0108081 100644 --- a/modules/imgcodecs/src/grfmt_pfm.cpp +++ b/modules/imgcodecs/src/grfmt_pfm.cpp @@ -81,20 +81,17 @@ PFMDecoder::~PFMDecoder() PFMDecoder::PFMDecoder() : m_scale_factor(0), m_swap_byte_order(false) { - m_strm.close(); + m_buf_supported = true; } bool PFMDecoder::readHeader() { - if (m_buf.empty()) { - if (!m_strm.open(m_filename)) { - return false; - } - } else { - if (!m_strm.open(m_buf)) { - return false; - } - } + if (!m_buf.empty()) + m_strm.open(m_buf); + else + m_strm.open(m_filename); + + if( !m_strm.isOpened()) return false; if (m_strm.getByte() != 'P') { CV_Error(Error::StsError, "Unexpected file type (expected P)"); @@ -177,6 +174,7 @@ void PFMDecoder::close() PFMEncoder::PFMEncoder() { m_description = "Portable image format - float (*.pfm)"; + m_buf_supported = true; } PFMEncoder::~PFMEncoder() diff --git a/modules/imgcodecs/src/grfmt_sunras.cpp index 852e735477..f2878d1760 100644 --- a/modules/imgcodecs/src/grfmt_sunras.cpp +++ b/modules/imgcodecs/src/grfmt_sunras.cpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html #include "precomp.hpp" #include "grfmt_sunras.hpp" @@ -60,6 +22,7 @@ SunRasterDecoder::SunRasterDecoder() m_encoding = RAS_STANDARD; m_maptype = RMT_NONE; m_maplength = 0; + m_buf_supported = true; } @@ -82,7 +45,12 @@ bool SunRasterDecoder::readHeader() { bool result = false; - if( !m_strm.open( m_filename )) return false; + if (!m_buf.empty()) + m_strm.open(m_buf); + else + m_strm.open(m_filename); + + if( !m_strm.isOpened()) return false; try { @@ -389,6 +357,7 @@ bad_decoding_end: SunRasterEncoder::SunRasterEncoder() { m_description = "Sun raster files (*.sr;*.ras)"; + m_buf_supported = true; } @@ -408,7 +377,18 @@ bool SunRasterEncoder::write( const Mat& img, const std::vector<int>& ) int fileStep = (width*channels + 1) & -2; WMByteStream strm; - if( strm.open(m_filename) ) + if (m_buf) { + if (!strm.open(*m_buf)) { + return false; + } + else { + m_buf->reserve(height * fileStep + 32); + } + } + else + strm.open(m_filename); + + if( strm.isOpened() ) { CHECK_WRITE(strm.putBytes( fmtSignSunRas, (int)strlen(fmtSignSunRas) )); CHECK_WRITE(strm.putDWord( width )); diff --git a/modules/imgcodecs/test/test_avif.cpp index 68678599b2..0d8a718756 100644 --- a/modules/imgcodecs/test/test_avif.cpp +++ b/modules/imgcodecs/test/test_avif.cpp @@ -336,6 +336,7 @@ TEST_P(Imgcodecs_Avif_Animation_WriteDecodeSuite, encode_decode) { file.seekg(0, std::ios::beg); std::vector<unsigned char> buf(size); EXPECT_TRUE(file.read(reinterpret_cast<char*>(buf.data()), size)); + file.close(); EXPECT_EQ(0, remove(output.c_str())); std::vector<cv::Mat> anim; ASSERT_TRUE(cv::imdecodemulti(buf, imread_mode_, anim)); diff --git a/modules/imgproc/src/resize.cpp index 1b8b85a04b..71813d320d 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -2617,8 +2617,8 @@ public: v_uint32 r0, r1, r2, r3; v_expand(vx_load(S0), r0, r1); v_expand(vx_load(S1), r2, r3); - r0 += r2; r1 += r3; - v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0)); + r0 = v_add(r0, r2); r1 = v_add(r1, r3); + v_rshr_pack_store<2>(D, v_add(r0, v_rotate_left<1>(r1, r0))); } #else v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3))); diff --git a/modules/imgproc/src/sumpixels.simd.hpp index 208ffc1231..b4aafeaea2 100644 --- a/modules/imgproc/src/sumpixels.simd.hpp +++ b/modules/imgproc/src/sumpixels.simd.hpp @@ -130,9 +130,9 @@ struct Integral_SIMD el8 = v_add(el8, v_rotate_left<1>(el8)); el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_expand(el8, el4l, el4h); @@ -188,11 +188,11 @@ struct Integral_SIMD el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); #if
@@ -350,9 +350,9 @@ struct Integral_SIMD prev.val = _mm256_permute2x128_si256(el4h.val, el4h.val, 0x31); #else #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_expand(el8, el4l, el4h); @@ -364,7 +364,7 @@ struct Integral_SIMD prev = v_combine_high(el4h, el4h); #else v_int32 t = v_rotate_right<12>(el4h); - t |= v_rotate_left<4>(t); + t = v_or(t, v_rotate_left<4>(t)); prev = v_combine_low(t, t); #endif #endif @@ -442,9 +442,9 @@ struct Integral_SIMD el8 = v_add(el8, v_rotate_left<1>(el8)); el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; @@ -501,11 +501,11 @@ struct Integral_SIMD el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; @@ -590,13 +590,13 @@ struct Integral_SIMD el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); - el8_3 += v_rotate_left<4>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); - el8_3 += v_rotate_left<8>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3; @@ -663,9 +663,9 @@ struct Integral_SIMD prev.val = _mm256_permute2f128_ps(el4h.val, el4h.val, 0x31); #else #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; @@ -678,7 +678,7 @@ struct Integral_SIMD prev = v_combine_high(el4h, el4h); #else v_float32 t = v_rotate_right<12>(el4h); - t |= v_rotate_left<4>(t); + t = v_or(t, v_rotate_left<4>(t)); prev = v_combine_low(t, t); #endif #endif @@ -770,9 +770,9 @@ struct Integral_SIMD el8 = v_add(el8, v_rotate_left<1>(el8)); el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; @@ -843,11 +843,11 @@ struct Integral_SIMD el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); #if 
CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; @@ -958,13 +958,13 @@ struct Integral_SIMD el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); - el8_3 += v_rotate_left<4>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); - el8_3 += v_rotate_left<8>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3; @@ -1058,9 +1058,9 @@ struct Integral_SIMD prev_1.val = prev_2.val = el4hh.val; #else #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; diff --git a/modules/ts/include/opencv2/ts/ocl_perf.hpp b/modules/ts/include/opencv2/ts/ocl_perf.hpp index aa87243a4f..89b224147a 100644 --- a/modules/ts/include/opencv2/ts/ocl_perf.hpp +++ b/modules/ts/include/opencv2/ts/ocl_perf.hpp @@ -64,7 +64,7 @@ using namespace perf; public: \ OCL##_##fixture##_##name() { } \ protected: \ - virtual void PerfTestBody(); \ + virtual void PerfTestBody() CV_OVERRIDE; \ }; \ TEST_F(OCL##_##fixture##_##name, name) { CV_TRACE_REGION("PERF_TEST: " #fixture "_" #name); declare.strategy(OCL_PERF_STRATEGY); RunPerfTestBody(); } \ void OCL##_##fixture##_##name::PerfTestBody() @@ -76,7 +76,7 @@ using namespace perf; public: \ OCL##_##fixture##_##name() { } \ protected: \ - virtual void PerfTestBody(); \ + virtual void PerfTestBody() CV_OVERRIDE; \ }; \ TEST_P(OCL##_##fixture##_##name, name) { CV_TRACE_REGION("PERF_TEST_P: " #fixture "_" #name); declare.strategy(OCL_PERF_STRATEGY); RunPerfTestBody(); } \ INSTANTIATE_TEST_CASE_P(/*none*/, OCL##_##fixture##_##name, params); \ diff --git a/modules/ts/include/opencv2/ts/ts_ext.hpp b/modules/ts/include/opencv2/ts/ts_ext.hpp index eebf4c594b..66e12d77d6 100644 --- a/modules/ts/include/opencv2/ts/ts_ext.hpp +++ b/modules/ts/include/opencv2/ts/ts_ext.hpp @@ -85,7 +85,7 @@ struct SkipThisTest : public ::testing::Test { };\ class test_case_name##test_name##_factory : public ::testing::internal::TestFactoryBase { \ public:\ - virtual ::testing::Test* CreateTest() { \ + virtual ::testing::Test* CreateTest() CV_OVERRIDE { \ try { \ return new GTEST_TEST_CLASS_NAME_(test_case_name, test_name); \ } catch (const cvtest::details::SkipTestExceptionBase& e) { \ @@ -150,7 +150,7 @@ struct SkipThisTest : public ::testing::Test { };\ class test_fixture##test_name##_factory : public ::testing::internal::TestFactoryBase { \ public:\ - virtual ::testing::Test* CreateTest() { \ + virtual ::testing::Test* CreateTest() CV_OVERRIDE { \ try { \ return new GTEST_TEST_CLASS_NAME_(test_fixture, test_name); \ } catch (const cvtest::details::SkipTestExceptionBase& e) { \ diff --git a/modules/ts/include/opencv2/ts/ts_gtest.h b/modules/ts/include/opencv2/ts/ts_gtest.h index 49eb3a5ec7..5cf6fc4537 100644 --- 
a/modules/ts/include/opencv2/ts/ts_gtest.h +++ b/modules/ts/include/opencv2/ts/ts_gtest.h @@ -8458,7 +8458,7 @@ class TestFactoryBase { template <class TestClass> class TestFactoryImpl : public TestFactoryBase { public: - virtual Test* CreateTest() { return new TestClass; } + virtual Test* CreateTest() override { return new TestClass; } }; #if GTEST_OS_WINDOWS @@ -11927,7 +11927,7 @@ class ParameterizedTestFactory : public internal::TestFactoryBase { typedef typename TestClass::ParamType ParamType; explicit ParameterizedTestFactory(ParamType parameter) : parameter_(parameter) {} - virtual Test* CreateTest() { + virtual Test* CreateTest() override { TestClass::SetParam(&parameter_); return new TestClass(); } @@ -11968,7 +11968,7 @@ class TestMetaFactory TestMetaFactory() {} - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) override { return new ParameterizedTestFactory<TestCase>(parameter); } @@ -12030,9 +12030,9 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { : test_case_name_(name), code_location_(code_location) {} // Test case base name for display purposes. - virtual const std::string& GetTestCaseName() const { return test_case_name_; } + virtual const std::string& GetTestCaseName() const override { return test_case_name_; } // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); } + virtual TypeId GetTestCaseTypeId() const override { return GetTypeId<TestCase>(); } // TEST_P macro uses AddTestPattern() to record information // about a single test in a LocalTestInfo structure. // test_case_name is the base name of the test case (without invocation @@ -12061,7 +12061,7 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { // This method should not be called more then once on any single // instance of a ParameterizedTestCaseInfoBase derived class. // UnitTest has a guard to prevent from calling this method more then once. - virtual void RegisterTests() { + virtual void RegisterTests() override { for (typename TestInfoContainer::iterator test_it = tests_.begin(); test_it != tests_.end(); ++test_it) { linked_ptr<TestInfo> test_info = *test_it; diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp index 4b59978f3c..c2ce7926bb 100644 --- a/modules/ts/include/opencv2/ts/ts_perf.hpp +++ b/modules/ts/include/opencv2/ts/ts_perf.hpp @@ -586,7 +586,7 @@ void PrintTo(const Size& sz, ::std::ostream* os); public:\ fixture() {}\ protected:\ - virtual void PerfTestBody();\ + virtual void PerfTestBody() CV_OVERRIDE;\ };\ TEST_F(fixture, testname){ CV__PERF_TEST_BODY_IMPL(#fixture "_" #testname); }\ }\ @@ -627,7 +627,7 @@ void PrintTo(const Size& sz, ::std::ostream* os); public:\ fixture##_##name() {}\ protected:\ - virtual void PerfTestBody();\ + virtual void PerfTestBody() CV_OVERRIDE;\ };\ CV__TEST_P(fixture##_##name, name, PerfTestBodyDummy,, CV__PERF_TEST_BODY_IMPL){} \ INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\ diff --git a/modules/video/src/hal_replacement.hpp new file mode 100644 index 0000000000..8d10ab39d1 --- /dev/null +++ b/modules/video/src/hal_replacement.hpp @@ -0,0 +1,101 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html.
+ +#ifndef OPENCV_VIDEO_HAL_REPLACEMENT_HPP +#define OPENCV_VIDEO_HAL_REPLACEMENT_HPP + +#include "opencv2/core/hal/interface.h" + +#if defined(__clang__) // clang or MSVC clang +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-parameter" +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +//! @addtogroup video_hal_interface +//! @note Define your functions to override default implementations: +//! @code +//! #undef cv_hal_LKOpticalFlowLevel +//! #define cv_hal_LKOpticalFlowLevel my_hal_LKOpticalFlowLevel +//! @endcode +//! @{ + +/** +@brief Lucas-Kanade optical flow for a single pyramid layer. See calcOpticalFlowPyrLK +@param prev_data previous frame image data +@param prev_data_step previous frame image data step +@param prev_deriv_data previous frame Scharr derivatives +@param prev_deriv_step previous frame Scharr derivatives step +@param next_data next frame image data +@param next_step next frame image step +@param width input images width +@param height input images height +@param cn source image channels +@param prev_points 2d points coordinates (x,y) on the previous frame +@param next_points points coordinates (x,y) on the next frame +@param point_count number of input points +@param status optical flow status for each point. Optional output, expected if not nullptr is provided +@param err optical flow estimation error for each point. Optional output, expected if not nullptr is provided +@param win_width optical flow window width +@param win_height optical flow window height +@param termination_count maximum algorithm iterations. 0 means unlimited +@param termination_epsilon maximal allowed algorithm error +@param get_min_eigen_vals return minimal eigen values as point errors in err buffer +@param min_eigen_vals_threshold eigen values threshold +**/ +inline int hal_ni_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_step, + const short* prev_deriv_data, size_t prev_deriv_step, + const uchar* next_data, size_t next_step, + int width, int height, int cn, + const float *prev_points, float *next_points, size_t point_count, + uchar *status, float *err, + const int win_width, const int win_height, + int termination_count, double termination_epsilon, + bool get_min_eigen_vals, + float min_eigen_vals_threshold) +{ + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +//! @cond IGNORED +#define cv_hal_LKOpticalFlowLevel hal_ni_LKOpticalFlowLevel +//! @endcond + +//! @} + +#if defined(__clang__) +#pragma clang diagnostic pop +#elif defined(_MSC_VER) +#pragma warning(pop) +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#include "custom_hal.hpp" + +//! @cond IGNORED +#define CALL_HAL_RET(name, fun, retval, ...) \ + int res = __CV_EXPAND(fun(__VA_ARGS__, &retval)); \ + if (res == CV_HAL_ERROR_OK) \ + return retval; \ + else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ + CV_Error_(cv::Error::StsInternal, \ + ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); + + +#define CALL_HAL(name, fun, ...) \ + int res = __CV_EXPAND(fun(__VA_ARGS__)); \ + if (res == CV_HAL_ERROR_OK) \ + return; \ + else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ + CV_Error_(cv::Error::StsInternal, \ + ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); +//! @endcond + +#endif
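A custom HAL would plug into this hook roughly as follows (my_lk_level and its internals are illustrative; only the signature and the macro override pattern are fixed by the header above):

// Illustrative override, as a custom HAL's custom_hal.hpp might define it.
inline int my_lk_level(const uchar *prev_data, size_t prev_data_step,
                       const short *prev_deriv_data, size_t prev_deriv_step,
                       const uchar *next_data, size_t next_step,
                       int width, int height, int cn,
                       const float *prev_points, float *next_points, size_t point_count,
                       uchar *status, float *err,
                       const int win_width, const int win_height,
                       int termination_count, double termination_epsilon,
                       bool get_min_eigen_vals, float min_eigen_vals_threshold)
{
    if (cn != 1)  // hypothetical restriction: accelerate single-channel input only
        return CV_HAL_ERROR_NOT_IMPLEMENTED;  // OpenCV falls back to its own code
    // ... vendor-optimized LK iterations over point_count points go here ...
    return CV_HAL_ERROR_OK;
}

#undef cv_hal_LKOpticalFlowLevel
#define cv_hal_LKOpticalFlowLevel my_lk_level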
diff --git a/modules/video/src/lkpyramid.cpp index 662ac13235..03de93ee08 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -49,6 +49,8 @@ #include "opencv2/3d.hpp" #endif +#include "hal_replacement.hpp" + #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) namespace @@ -182,11 +184,17 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const { CV_INSTRUMENT_REGION(); + const int W_BITS = 14, W_BITS1 = 14; + const float FLT_SCALE = 1.f/(1 << 20); + Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f); const Mat& I = *prevImg; const Mat& J = *nextImg; const Mat& derivI = *prevDeriv; + cv::AutoBuffer<Point2f> prevPtsScaledData(range.end - range.start); + Point2f* prevPtsScaled = prevPtsScaledData.data(); + int j, cn = I.channels(), cn2 = cn*2; cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2)); int derivDepth = DataType<deriv_type>::depth; @@ -208,7 +216,23 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const else nextPt = nextPts[ptidx]*2.f; nextPts[ptidx] = nextPt; + prevPtsScaled[ptidx-range.start] = prevPt; + } + CALL_HAL(LKOpticalFlowLevel, cv_hal_LKOpticalFlowLevel, + I.data, I.step, (const short*)derivI.data, derivI.step, J.data, J.step, + I.cols, I.rows, I.channels(), + (float*)prevPtsScaled, (float*)(nextPts+range.start), range.end-range.start, + (level == 0) ? status+range.start: nullptr, + err != nullptr ? err+range.start: nullptr, + winSize.width, winSize.height, criteria.maxCount, criteria.epsilon, + (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) != 0, + (float)minEigThreshold + ); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPtsScaled[ptidx-range.start]; Point2i iprevPt, inextPt; prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); @@ -219,8 +243,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const { if( level == 0 ) { - if( status ) - status[ptidx] = false; + status[ptidx] = false; if( err ) err[ptidx] = 0; } @@ -229,8 +252,6 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const float a = prevPt.x - iprevPt.x; float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - const float FLT_SCALE = 1.f/(1 << 20); int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); @@ -477,14 +498,14 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const if( minEig < minEigThreshold || D < FLT_EPSILON ) { - if( level == 0 && status ) + if(level == 0) status[ptidx] = false; continue; } D = 1.f/D; - nextPt -= halfWin; + Point2f nextPt = nextPts[ptidx] - halfWin; Point2f prevDelta; for( j = 0; j < criteria.maxCount; j++ ) @@ -495,7 +516,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const if( inextPt.x < -winSize.width || inextPt.x >= J.cols || inextPt.y < -winSize.height || inextPt.y >= J.rows ) { - if( level == 0 && status ) + if( level == 0 ) status[ptidx] = false; break; } @@ -678,7 +699,6 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const prevDelta = delta; } - CV_Assert(status != NULL); if( status[ptidx] && err && level == 0 && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) == 0 ) { Point2f nextPoint = nextPts[ptidx] - halfWin; @@ -690,8 +710,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const if( inextPoint.x < -winSize.width || inextPoint.x >= J.cols || inextPoint.y < -winSize.height || inextPoint.y >= J.rows
diff --git a/platforms/linux/arm.toolchain.cmake b/platforms/linux/arm.toolchain.cmake
index 184997fba5..ddbad83e51 100644
--- a/platforms/linux/arm.toolchain.cmake
+++ b/platforms/linux/arm.toolchain.cmake
@@ -48,32 +48,23 @@ if(NOT DEFINED ARM_LINUX_SYSROOT AND DEFINED GNU_MACHINE)
   set(ARM_LINUX_SYSROOT /usr/${GNU_MACHINE}${FLOAT_ABI_SUFFIX})
 endif()

-if(NOT DEFINED CMAKE_CXX_FLAGS)
-  set(CMAKE_CXX_FLAGS "" CACHE INTERNAL "")
-  set(CMAKE_C_FLAGS "" CACHE INTERNAL "")
-  set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
-  set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")
-  set(CMAKE_EXE_LINKER_FLAGS "" CACHE INTERNAL "")
-
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
-  if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
-    set(CMAKE_CXX_FLAGS "-mthumb ${CMAKE_CXX_FLAGS}")
-    set(CMAKE_C_FLAGS "-mthumb ${CMAKE_C_FLAGS}")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,nocopyreloc")
+# == Compiler flags
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
+  set(CMAKE_CXX_FLAGS_INIT "-mthumb")
+  set(CMAKE_C_FLAGS_INIT "-mthumb")
+  set(common_ld_opt "-Wl,--fix-cortex-a8")
+  set(CMAKE_SHARED_LINKER_FLAGS_INIT "${common_ld_opt}")
+  set(CMAKE_MODULE_LINKER_FLAGS_INIT "${common_ld_opt}")
+  set(CMAKE_EXE_LINKER_FLAGS_INIT "${common_ld_opt} -Wl,-z,nocopyreloc")
+elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+  include("${CMAKE_CURRENT_LIST_DIR}/flags-aarch64.cmake")
+  if(COMMAND ocv_set_platform_flags)
+    ocv_set_platform_flags(CMAKE_CXX_FLAGS_INIT)
+    ocv_set_platform_flags(CMAKE_C_FLAGS_INIT)
   endif()
-  if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
-    set(ARM_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-sections -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now")
-  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64)
-    set(ARM_LINKER_FLAGS "-Wl,--no-undefined -Wl,--gc-sections -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now")
-  endif()
-  set(CMAKE_SHARED_LINKER_FLAGS "${ARM_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
-  set(CMAKE_MODULE_LINKER_FLAGS "${ARM_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
-  set(CMAKE_EXE_LINKER_FLAGS "${ARM_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
-else()
-  #message(WARNING "CMAKE_CXX_FLAGS='${CMAKE_CXX_FLAGS}' is defined")
 endif()
+
 if(USE_NEON)
   message(WARNING "You use obsolete variable USE_NEON to enable NEON instruction set. Use -DENABLE_NEON=ON instead.")
   set(ENABLE_NEON TRUE)
diff --git a/platforms/linux/flags-aarch64.cmake b/platforms/linux/flags-aarch64.cmake
new file mode 100644
index 0000000000..5aeb7a2b6a
--- /dev/null
+++ b/platforms/linux/flags-aarch64.cmake
@@ -0,0 +1,19 @@
+# see https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html#index-march
+function(ocv_set_platform_flags VAR)
+  unset(flags)
+  if(ENABLE_BF16)
+    set(flags "${flags}+bf16")
+  endif()
+  if(ENABLE_DOTPROD)
+    set(flags "${flags}+dotprod")
+  endif()
+  if(ENABLE_FP16)
+    set(flags "${flags}+fp16")
+  endif()
+  if(DEFINED ENABLE_NEON AND NOT ENABLE_NEON)
+    set(flags "${flags}+nosimd")
+  endif()
+  if(flags)
+    set(${VAR} "-march=armv8.2-a${flags}" PARENT_SCOPE)
+  endif()
+endfunction()
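flags-aarch64.cmake only assembles the -march string; whether the requested extensions actually reached the compiler can be checked against the standard ACLE predefined macros. A small standalone probe, illustrative and not part of the patch:

#include <cstdio>

int main()
{
// These macros are defined by the compiler per the ARM C Language Extensions
// when the corresponding -march feature modifier is in effect.
#if defined(__ARM_FEATURE_DOTPROD)
    std::puts("+dotprod is active");
#endif
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    std::puts("+fp16 is active");
#endif
#if defined(__ARM_FEATURE_BF16)
    std::puts("+bf16 is active");
#endif
#if !defined(__ARM_NEON)
    std::puts("Advanced SIMD is off (+nosimd or a non-ARM build)");
#endif
    return 0;
}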
diff --git a/platforms/linux/flags-riscv64.cmake b/platforms/linux/flags-riscv64.cmake
new file mode 100644
index 0000000000..4488cf5887
--- /dev/null
+++ b/platforms/linux/flags-riscv64.cmake
@@ -0,0 +1,9 @@
+# see https://gcc.gnu.org/onlinedocs/gcc/RISC-V-Options.html#index-march-14
+function(ocv_set_platform_flags VAR)
+  if(ENABLE_RVV OR RISCV_RVV_SCALABLE)
+    set(flags "-march=rv64gcv")
+  else()
+    set(flags "-march=rv64gc")
+  endif()
+  set(${VAR} "${flags}" PARENT_SCOPE)
+endfunction()
diff --git a/platforms/linux/riscv64-andes-gcc.toolchain.cmake b/platforms/linux/riscv64-andes-gcc.toolchain.cmake
index 9b9c0b5246..a18c3df9e1 100755
--- a/platforms/linux/riscv64-andes-gcc.toolchain.cmake
+++ b/platforms/linux/riscv64-andes-gcc.toolchain.cmake
@@ -10,16 +10,12 @@
 set(CMAKE_C_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-gcc)
 set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-g++)

 # fix toolchain macro
-
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ANDES=1")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ANDES=1")
-
 # enable rvp
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gc -mext-dsp")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc -mext-dsp")
+set(CMAKE_C_FLAGS_INIT "-march=rv64gc -mext-dsp -D__ANDES=1")
+set(CMAKE_CXX_FLAGS_INIT "-march=rv64gc -mext-dsp -D__ANDES=1")

 # fix segment address
-set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-Ttext-segment=0x50000")
-set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Ttext-segment=0x50000")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-Wl,-Ttext-segment=0x50000")
+set(CMAKE_SHARED_LINKER_FLAGS_INIT "-Wl,-Ttext-segment=0x50000")
diff --git a/platforms/linux/riscv64-clang.toolchain.cmake b/platforms/linux/riscv64-clang.toolchain.cmake
index 612be05eab..939350fcbd 100644
--- a/platforms/linux/riscv64-clang.toolchain.cmake
+++ b/platforms/linux/riscv64-clang.toolchain.cmake
@@ -17,8 +17,13 @@ set(CMAKE_ASM_COMPILER_TARGET ${CLANG_TARGET_TRIPLE})
 # Don't run the linker on compiler check
 set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)

-set(CMAKE_C_FLAGS "-march=rv64gc --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gc --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w ${CMAKE_CXX_FLAGS}")
+include("${CMAKE_CURRENT_LIST_DIR}/flags-riscv64.cmake")
+if(COMMAND ocv_set_platform_flags)
+  ocv_set_platform_flags(CMAKE_CXX_FLAGS_INIT)
+  ocv_set_platform_flags(CMAKE_C_FLAGS_INIT)
+endif()
+set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w")
+set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w")

 set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT})
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
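The RISC-V toolchains above now take their -march from flags-riscv64.cmake. The RVV choice can be verified the same way as the aarch64 one, since compilers predefine __riscv_vector when the V extension is enabled. Another illustrative probe, not part of the patch:

#include <cstdio>

int main()
{
// __riscv and __riscv_vector are standard predefined macros on RISC-V
// targets; the latter appears only with a V-enabled -march such as rv64gcv.
#if defined(__riscv) && defined(__riscv_vector)
    std::puts("RVV enabled (-march=rv64gcv)");
#elif defined(__riscv)
    std::puts("RISC-V without RVV (-march=rv64gc)");
#else
    std::puts("not a RISC-V build");
#endif
    return 0;
}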
diff --git a/platforms/linux/riscv64-gcc.toolchain.cmake b/platforms/linux/riscv64-gcc.toolchain.cmake
index c3a0e161e3..7a067d3f1a 100644
--- a/platforms/linux/riscv64-gcc.toolchain.cmake
+++ b/platforms/linux/riscv64-gcc.toolchain.cmake
@@ -1,10 +1,11 @@
 set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_VERSION 1)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
 set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")

-if(NOT DEFINED CMAKE_CXX_FLAGS) # guards toolchain multiple calls
-  set(CMAKE_C_FLAGS "-march=rv64gc")
-  set(CMAKE_CXX_FLAGS "-march=rv64gc")
+include("${CMAKE_CURRENT_LIST_DIR}/flags-riscv64.cmake")
+if(COMMAND ocv_set_platform_flags)
+  ocv_set_platform_flags(CMAKE_CXX_FLAGS_INIT)
+  ocv_set_platform_flags(CMAKE_C_FLAGS_INIT)
 endif()

 include("${CMAKE_CURRENT_LIST_DIR}/riscv-gnu.toolchain.cmake")
diff --git a/samples/dnn/models.yml b/samples/dnn/models.yml
index 99abf22b26..f2e9e98be2 100644
--- a/samples/dnn/models.yml
+++ b/samples/dnn/models.yml
@@ -74,7 +74,24 @@ yolov8l:
   rgb: true
   labels: "object_detection_classes_yolo.txt"
   postprocessing: "yolov8"
-  sample: "yolo_detector"
+  sample: "object_detection"
+
+# YOLOv5 object detection family from ultralytics (https://github.com/ultralytics/ultralytics)
+# May be used for all of YOLOv5n, YOLOv5s, YOLOv5m, YOLOv5l and YOLOv5x
+
+yolov5l:
+  load_info:
+    url: "https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov5l.onnx"
+    sha1: "9de7e54c524b7fe7577bbd4cdbbdaed53375c8f1"
+  model: "yolov5l.onnx"
+  mean: 0.0
+  scale: 0.00392
+  width: 640
+  height: 640
+  rgb: true
+  classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
+  sample: "object_detection"

 # YOLO4 object detection family from Darknet (https://github.com/AlexeyAB/darknet)
 # YOLO object detection family from Darknet (https://pjreddie.com/darknet/yolo/)
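For readers mapping the yolov5l keys onto API calls, a rough C++ equivalent of the preprocessing they describe is sketched below; it is not part of the patch, the file name input.jpg is a placeholder, and the Python sample derives the same blobFromImage call from the yml (scale 0.00392 is approximately 1/255, rgb: true becomes swapRB):

#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::dnn::Net net = cv::dnn::readNet("yolov5l.onnx"); // model: "yolov5l.onnx"
    cv::Mat img = cv::imread("input.jpg");               // hypothetical input image
    cv::Mat blob = cv::dnn::blobFromImage(
        img,
        0.00392,                // scale: 0.00392 (about 1/255)
        cv::Size(640, 640),     // width/height: 640
        cv::Scalar(),           // mean: 0.0
        true);                  // rgb: true -> swap OpenCV's BGR to RGB
    net.setInput(blob);
    cv::Mat out = net.forward();
    return out.empty() ? 1 : 0;
}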
diff --git a/samples/python/tracker.py b/samples/python/tracker.py
index 9e6f939275..58a652a002 100644
--- a/samples/python/tracker.py
+++ b/samples/python/tracker.py
@@ -23,6 +23,9 @@ USAGE:
                              [--nanotrack_backbone NANOTRACK_BACKBONE]
                              [--nanotrack_headneck NANOTRACK_TARGET]
                              [--vittrack_net VITTRACK_MODEL]
+                             [--tracking_score_threshold TRACKING_SCORE_THRESHOLD]
+                             [--backend BACKEND]
+                             [--target TARGET]
 '''

 # Python 2/3 compatibility
@@ -36,6 +40,11 @@ import argparse

 from video import create_capture, presets

+backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
+            cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
+targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
+           cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
+
 class App(object):

     def __init__(self, args):
@@ -51,15 +60,23 @@ class App(object):
             params.model = self.args.dasiamrpn_net
             params.kernel_cls1 = self.args.dasiamrpn_kernel_cls1
             params.kernel_r1 = self.args.dasiamrpn_kernel_r1
+            params.backend = args.backend
+            params.target = args.target
             tracker = cv.TrackerDaSiamRPN_create(params)
         elif self.trackerAlgorithm == 'nanotrack':
             params = cv.TrackerNano_Params()
             params.backbone = args.nanotrack_backbone
             params.neckhead = args.nanotrack_headneck
+            params.backend = args.backend
+            params.target = args.target
             tracker = cv.TrackerNano_create(params)
         elif self.trackerAlgorithm == 'vittrack':
             params = cv.TrackerVit_Params()
             params.net = args.vittrack_net
+            if args.tracking_score_threshold:
+                params.tracking_score_threshold = args.tracking_score_threshold
+            params.backend = args.backend
+            params.target = args.target
             tracker = cv.TrackerVit_create(params)
         else:
             sys.exit("Tracker {} is not recognized. Please use one of three available: mil, dasiamrpn, nanotrack.".format(self.trackerAlgorithm))
@@ -133,6 +149,24 @@ if __name__ == '__main__':
     parser.add_argument("--nanotrack_backbone", type=str, default="nanotrack_backbone_sim.onnx", help="Path to onnx model of NanoTrack backBone")
     parser.add_argument("--nanotrack_headneck", type=str, default="nanotrack_head_sim.onnx", help="Path to onnx model of NanoTrack headNeck")
     parser.add_argument("--vittrack_net", type=str, default="vitTracker.onnx", help="Path to onnx model of vittrack")
+    parser.add_argument('--tracking_score_threshold', type=float,
+                        help="Tracking score threshold (vittrack only): a bounding box is reported as found only if its score is at least this value")
+    parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
+                        help="Choose one of computation backends: "
+                             "%d: automatically (by default), "
+                             "%d: Halide language (http://halide-lang.org/), "
+                             "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+                             "%d: OpenCV implementation, "
+                             "%d: VKCOM, "
+                             "%d: CUDA" % backends)
+    parser.add_argument("--target", choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
+                        help="Choose one of target computation devices: "
+                             '%d: CPU target (by default), '
+                             '%d: OpenCL, '
+                             '%d: OpenCL fp16 (half-float precision), '
+                             '%d: VPU, '
+                             '%d: VULKAN, '
+                             '%d: CUDA, '
+                             '%d: CUDA fp16 (half-float precision)' % targets)
     args = parser.parse_args()

     App(args).run()
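The new --backend and --target flags write straight into the tracker Params structs, and the same integer fields exist on the C++ side, so the sample change maps one-to-one onto the native API. A minimal C++ counterpart, sketched under the assumption that the model paths (the sample defaults) exist at run time:

#include <opencv2/dnn.hpp>
#include <opencv2/video/tracking.hpp>

int main()
{
    cv::TrackerNano::Params params;
    params.backbone = "nanotrack_backbone_sim.onnx"; // sample default paths
    params.neckhead = "nanotrack_head_sim.onnx";
    params.backend = cv::dnn::DNN_BACKEND_OPENCV;    // what --backend selects
    params.target = cv::dnn::DNN_TARGET_CPU;         // what --target selects
    cv::Ptr<cv::TrackerNano> tracker = cv::TrackerNano::create(params);
    return tracker.empty() ? 1 : 0;
}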