Merge branch 'opencv:4.x' into simd-demosaicing
commit 54d0610f50
3  .github/workflows/PR-4.x.yaml (vendored)
@@ -31,6 +31,9 @@ jobs:
  Windows10-x64:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10.yaml@main

  Windows10-x64-UWP:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-UWP.yaml@main

  Windows10-ARM64:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-ARM64.yaml@main
46  3rdparty/carotene/hal/tegra_hal.hpp (vendored)
@@ -1932,4 +1932,50 @@ inline int TEGRA_GaussianBlurBinomial(const uchar* src_data, size_t src_step, uc

#endif // OPENCV_IMGPROC_HAL_INTERFACE_H

// The optimized branch was developed for old armv7 processors
#if defined(__ARM_ARCH) && (__ARM_ARCH == 7)
inline int TEGRA_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_step,
                                    const short* prev_deriv_data, size_t prev_deriv_step,
                                    const uchar* next_data, size_t next_step,
                                    int width, int height, int cn,
                                    const float *prev_points, float *next_points, size_t point_count,
                                    uchar *status, float *err,
                                    const int win_width, const int win_height,
                                    int termination_count, double termination_epsilon,
                                    bool get_min_eigen_vals,
                                    float min_eigen_vals_threshold)
{
    if (!CAROTENE_NS::isSupportedConfiguration())
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    CAROTENE_NS::pyrLKOptFlowLevel(CAROTENE_NS::Size2D(width, height), cn,
                                   prev_data, prev_data_step, prev_deriv_data, prev_deriv_step,
                                   next_data, next_step,
                                   point_count, prev_points, next_points,
                                   status, err, CAROTENE_NS::Size2D(win_width, win_height),
                                   termination_count, termination_epsilon,
                                   get_min_eigen_vals, min_eigen_vals_threshold);
    return CV_HAL_ERROR_OK;
}

#undef cv_hal_LKOpticalFlowLevel
#define cv_hal_LKOpticalFlowLevel TEGRA_LKOpticalFlowLevel
#endif // __ARM_ARCH == 7

#if 0 // OpenCV provides faster parallel implementation
inline int TEGRA_ScharrDeriv(const uchar* src_data, size_t src_step,
                             short* dst_data, size_t dst_step,
                             int width, int height, int cn)
{
    if (!CAROTENE_NS::isSupportedConfiguration())
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    CAROTENE_NS::ScharrDeriv(CAROTENE_NS::Size2D(width, height), cn, src_data, src_step, dst_data, dst_step);
    return CV_HAL_ERROR_OK;
}

#undef cv_hal_ScharrDeriv
#define cv_hal_ScharrDeriv TEGRA_ScharrDeriv
#endif

#endif
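For context on how an override such as cv_hal_LKOpticalFlowLevel takes effect: OpenCV's HAL hooks are plain macros that default to a stub returning CV_HAL_ERROR_NOT_IMPLEMENTED, a HAL header re-points the macro at its own function, and the generic code runs whenever the replacement declines. A minimal hedged sketch of that dispatch pattern, with hypothetical names and a simplified call site rather than OpenCV's actual internals:

#include <cstddef>

#define MY_HAL_ERROR_OK               0
#define MY_HAL_ERROR_NOT_IMPLEMENTED  1

// Default hook: always declines, so the generic path runs.
inline int default_scharr(const unsigned char*, std::size_t, short*, std::size_t, int, int, int)
{ return MY_HAL_ERROR_NOT_IMPLEMENTED; }

#define my_hal_scharrDeriv default_scharr

// A vendor HAL re-points the hook, exactly like the #undef/#define pairs above.
inline int vendor_scharr(const unsigned char*, std::size_t, short*, std::size_t, int, int, int)
{ return MY_HAL_ERROR_OK; }
#undef my_hal_scharrDeriv
#define my_hal_scharrDeriv vendor_scharr

int scharr_entry(const unsigned char* src, std::size_t sstep, short* dst, std::size_t dstep,
                 int w, int h, int cn)
{
    // Call site: try the HAL hook first, fall back to the reference implementation.
    if (my_hal_scharrDeriv(src, sstep, dst, dstep, w, h, cn) == MY_HAL_ERROR_OK)
        return 0;
    // ...generic implementation would run here...
    return 0;
}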
@@ -2485,7 +2485,7 @@ namespace CAROTENE_NS {
                           u8 *status, f32 *err,
                           const Size2D &winSize,
                           u32 terminationCount, f64 terminationEpsilon,
                           u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
                           bool getMinEigenVals,
                           f32 minEigThreshold);
}
53  3rdparty/carotene/src/opticalflow.cpp (vendored)
@@ -58,7 +58,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
                       u8 *status, f32 *err,
                       const Size2D &winSize,
                       u32 terminationCount, f64 terminationEpsilon,
                       u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
                       bool getMinEigenVals,
                       f32 minEigThreshold)
{
    internal::assertSupportedConfiguration();
@@ -74,32 +74,11 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,

    for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
    {
        f32 levscale = (1./(1 << level));
        u32 ptref = ptidx << 1;
        f32 prevPtX = prevPts[ptref+0]*levscale;
        f32 prevPtY = prevPts[ptref+1]*levscale;
        f32 nextPtX;
        f32 nextPtY;
        if( level == maxLevel )
        {
            if( useInitialFlow )
            {
                nextPtX = nextPts[ptref+0]*levscale;
                nextPtY = nextPts[ptref+1]*levscale;
            }
            else
            {
                nextPtX = prevPtX;
                nextPtY = prevPtY;
            }
        }
        else
        {
            nextPtX = nextPts[ptref+0]*2.f;
            nextPtY = nextPts[ptref+1]*2.f;
        }
        nextPts[ptref+0] = nextPtX;
        nextPts[ptref+1] = nextPtY;
        f32 prevPtX = prevPts[ptref+0];
        f32 prevPtY = prevPts[ptref+1];
        f32 nextPtX = nextPts[ptref+0];
        f32 nextPtY = nextPts[ptref+1];

        s32 iprevPtX, iprevPtY;
        s32 inextPtX, inextPtY;
@@ -111,13 +90,10 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
        if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
            iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
        {
            if( level == 0 )
            {
                if( status )
                    status[ptidx] = false;
                if( err )
                    err[ptidx] = 0;
            }
            if( status )
                status[ptidx] = false;
            if( err )
                err[ptidx] = 0;
            continue;
        }

@@ -333,7 +309,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,

        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
            if( level == 0 && status )
            if( status )
                status[ptidx] = false;
            continue;
        }
@@ -353,7 +329,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
            if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
                inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
            {
                if( level == 0 && status )
                if( status )
                    status[ptidx] = false;
                break;
            }
@@ -469,8 +445,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
            prevDeltaX = deltaX;
            prevDeltaY = deltaY;
        }

        if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
        if( status && status[ptidx] && err && !getMinEigenVals )
        {
            f32 nextPointX = nextPts[ptref+0] - halfWinX;
            f32 nextPointY = nextPts[ptref+1] - halfWinY;
@@ -526,9 +501,6 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
    (void)winSize;
    (void)terminationCount;
    (void)terminationEpsilon;
    (void)level;
    (void)maxLevel;
    (void)useInitialFlow;
    (void)getMinEigenVals;
    (void)minEigThreshold;
    (void)ptCount;
@@ -536,4 +508,3 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
}

}//CAROTENE_NS
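The deleted block above handled the pyramid-level bookkeeping inside pyrLKOptFlowLevel itself: scaling prevPts/nextPts by 1/(1 << level) and doubling coordinates when descending a level. With the single-level HAL entry point, that responsibility moves to the caller. A rough hedged sketch of what a caller would do between levels, assuming points are stored as interleaved x,y floats in full-resolution coordinates (hypothetical helper, not the actual OpenCV driver code):

#include <cstddef>

// Hypothetical per-level preparation: convert full-resolution coordinates to the
// current pyramid level before calling the single-level LK routine, mirroring the
// levscale = 1/(1 << level) logic that was removed from pyrLKOptFlowLevel.
void prepare_points_for_level(const float* prevFull, float* prevLvl,
                              float* nextLvl, std::size_t count,
                              int level, int maxLevel, bool useInitialFlow)
{
    const float levscale = 1.f / static_cast<float>(1 << level);
    for (std::size_t i = 0; i < 2 * count; ++i)
    {
        prevLvl[i] = prevFull[i] * levscale;
        if (level == maxLevel)
            // Topmost level: start from the initial guess or from the previous points.
            nextLvl[i] = useInitialFlow ? nextLvl[i] * levscale : prevLvl[i];
        else
            // Coming down one level: coordinates from the coarser level are doubled.
            nextLvl[i] = nextLvl[i] * 2.f;
    }
}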
32  3rdparty/fastcv/CMakeLists.txt (vendored, new file)
@@ -0,0 +1,32 @@
if(HAVE_FASTCV)
    set(FASTCV_HAL_VERSION 0.0.1 CACHE INTERNAL "")
    set(FASTCV_HAL_LIBRARIES "fastcv_hal" CACHE INTERNAL "")
    set(FASTCV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/include" CACHE INTERNAL "")
    set(FASTCV_HAL_HEADERS
        "${CMAKE_CURRENT_SOURCE_DIR}/include/fastcv_hal_core.hpp"
        "${CMAKE_CURRENT_SOURCE_DIR}/include/fastcv_hal_imgproc.hpp"
        CACHE INTERNAL "")

    file(GLOB FASTCV_HAL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")

    add_library(fastcv_hal STATIC ${FASTCV_HAL_FILES})

    target_include_directories(fastcv_hal PRIVATE
        ${CMAKE_SOURCE_DIR}/modules/core/include
        ${CMAKE_SOURCE_DIR}/modules/imgproc/include
        ${FASTCV_HAL_INCLUDE_DIRS} ${FastCV_INCLUDE_PATH})

    target_link_libraries(fastcv_hal PUBLIC ${FASTCV_LIBRARY})

    set_target_properties(fastcv_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})

    if(NOT BUILD_SHARED_LIBS)
        ocv_install_target(fastcv_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
    endif()

    if(ENABLE_SOLUTION_FOLDERS)
        set_target_properties(fastcv_hal PROPERTIES FOLDER "3rdparty")
    endif()
else()
    message(STATUS "FastCV is not available, disabling related HAL")
endif(HAVE_FASTCV)
43  3rdparty/fastcv/fastcv.cmake (vendored, new file)
@@ -0,0 +1,43 @@
function(download_fastcv root_dir)

    # Commit SHA in the opencv_3rdparty repo
    set(FASTCV_COMMIT "dc5d58018f3af915a8d209386d2c58c0501c0f2c")

    # Define actual FastCV versions
    if(ANDROID)
        if(AARCH64)
            message(STATUS "Download FastCV for Android aarch64")
            set(FCV_PACKAGE_NAME "fastcv_android_aarch64_2024_12_11.tgz")
            set(FCV_PACKAGE_HASH "9dac41e86597305f846212dae31a4a88")
        else()
            message(STATUS "Download FastCV for Android armv7")
            set(FCV_PACKAGE_NAME "fastcv_android_arm32_2024_12_11.tgz")
            set(FCV_PACKAGE_HASH "fe2d30334180b17e3031eee92aac43b6")
        endif()
    elseif(UNIX AND NOT APPLE AND NOT IOS AND NOT XROS)
        if(AARCH64)
            set(FCV_PACKAGE_NAME "fastcv_linux_aarch64_2024_12_11.tgz")
            set(FCV_PACKAGE_HASH "7b33ad833e6f15ab6d4ec64fa3c17acd")
        else()
            message("FastCV: fastcv lib for 32-bit Linux is not supported for now!")
        endif()
    endif(ANDROID)

    # Download Package
    set(OPENCV_FASTCV_URL "https://raw.githubusercontent.com/opencv/opencv_3rdparty/${FASTCV_COMMIT}/fastcv/")

    ocv_download( FILENAME ${FCV_PACKAGE_NAME}
                  HASH ${FCV_PACKAGE_HASH}
                  URL ${OPENCV_FASTCV_URL}
                  DESTINATION_DIR ${root_dir}
                  ID FASTCV
                  STATUS res
                  UNPACK
                  RELATIVE_URL)
    if(res)
        set(HAVE_FASTCV TRUE CACHE BOOL "FastCV status")
    else()
        message(WARNING "FastCV: package download failed!")
    endif()

endfunction()
222  3rdparty/fastcv/include/fastcv_hal_core.hpp (vendored, new file)
@ -0,0 +1,222 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#ifndef OPENCV_FASTCV_HAL_CORE_HPP_INCLUDED
|
||||
#define OPENCV_FASTCV_HAL_CORE_HPP_INCLUDED
|
||||
|
||||
#include <opencv2/core/base.hpp>
|
||||
|
||||
#undef cv_hal_lut
|
||||
#define cv_hal_lut fastcv_hal_lut
|
||||
#undef cv_hal_normHammingDiff8u
|
||||
#define cv_hal_normHammingDiff8u fastcv_hal_normHammingDiff8u
|
||||
#undef cv_hal_mul8u16u
|
||||
#define cv_hal_mul8u16u fastcv_hal_mul8u16u
|
||||
#undef cv_hal_sub8u32f
|
||||
#define cv_hal_sub8u32f fastcv_hal_sub8u32f
|
||||
#undef cv_hal_transpose2d
|
||||
#define cv_hal_transpose2d fastcv_hal_transpose2d
|
||||
#undef cv_hal_meanStdDev
|
||||
#define cv_hal_meanStdDev fastcv_hal_meanStdDev
|
||||
#undef cv_hal_flip
|
||||
#define cv_hal_flip fastcv_hal_flip
|
||||
#undef cv_hal_rotate90
|
||||
#define cv_hal_rotate90 fastcv_hal_rotate
|
||||
#undef cv_hal_addWeighted8u
|
||||
#define cv_hal_addWeighted8u fastcv_hal_addWeighted8u
|
||||
#undef cv_hal_mul8u
|
||||
#define cv_hal_mul8u fastcv_hal_mul8u
|
||||
#undef cv_hal_mul16s
|
||||
#define cv_hal_mul16s fastcv_hal_mul16s
|
||||
#undef cv_hal_mul32f
|
||||
#define cv_hal_mul32f fastcv_hal_mul32f
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief look-up table transform of an array.
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param src_type Source image type
|
||||
/// @param lut_data Pointer to lookup table
|
||||
/// @param lut_channel_size Size of each channel in bytes
|
||||
/// @param lut_channels Number of channels in lookup table
|
||||
/// @param dst_data Destination data
|
||||
/// @param dst_step Destination step
|
||||
/// @param width Width of images
|
||||
/// @param height Height of images
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_lut(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
size_t src_type,
|
||||
const uchar* lut_data,
|
||||
size_t lut_channel_size,
|
||||
size_t lut_channels,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Hamming distance between two vectors
|
||||
/// @param a pointer to first vector data
|
||||
/// @param b pointer to second vector data
|
||||
/// @param n length of vectors
|
||||
/// @param cellSize how many bits of the vectors will be added and treated as a single bit, can be 1 (standard Hamming distance), 2 or 4
|
||||
/// @param result pointer to result output
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_mul8u16u(
|
||||
const uchar * src1_data,
|
||||
size_t src1_step,
|
||||
const uchar * src2_data,
|
||||
size_t src2_step,
|
||||
ushort * dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_sub8u32f(
|
||||
const uchar *src1_data,
|
||||
size_t src1_step,
|
||||
const uchar *src2_data,
|
||||
size_t src2_step,
|
||||
float *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_transpose2d(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
int element_size);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_meanStdDev(
|
||||
const uchar * src_data,
|
||||
size_t src_step,
|
||||
int width,
|
||||
int height,
|
||||
int src_type,
|
||||
double * mean_val,
|
||||
double * stddev_val,
|
||||
uchar * mask,
|
||||
size_t mask_step);
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Flips a 2D array around vertical, horizontal, or both axes
|
||||
/// @param src_type source and destination image type
|
||||
/// @param src_data source image data
|
||||
/// @param src_step source image step
|
||||
/// @param src_width source and destination image width
|
||||
/// @param src_height source and destination image height
|
||||
/// @param dst_data destination image data
|
||||
/// @param dst_step destination image step
|
||||
/// @param flip_mode 0 flips around x-axis, 1 around y-axis, -1 both
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_flip(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int flip_mode);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Rotates a 2D array in multiples of 90 degrees.
|
||||
/// @param src_type source and destination image type
|
||||
/// @param src_data source image data
|
||||
/// @param src_step source image step
|
||||
/// @param src_width source image width
///        If angle is 180 it is also the destination image width
///        If angle is 90 or 270 it is also the destination image height
/// @param src_height source image height
///        If angle is 180 it is also the destination image height
///        If angle is 90 or 270 it is also the destination image width
|
||||
/// @param dst_data destination image data
|
||||
/// @param dst_step destination image step
|
||||
/// @param angle clockwise angle for rotation in degrees from set [90, 180, 270]
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_rotate(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int angle);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief weighted sum of two arrays using formula: dst[i] = a * src1[i] + b * src2[i] + c
|
||||
/// @param src1_data first source image data
|
||||
/// @param src1_step first source image step
|
||||
/// @param src2_data second source image data
|
||||
/// @param src2_step second source image step
|
||||
/// @param dst_data destination image data
|
||||
/// @param dst_step destination image step
|
||||
/// @param width width of the images
|
||||
/// @param height height of the images
|
||||
/// @param scalars numbers a, b, and c
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_addWeighted8u(
|
||||
const uchar* src1_data,
|
||||
size_t src1_step,
|
||||
const uchar* src2_data,
|
||||
size_t src2_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
const double scalars[3]);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_mul8u(
|
||||
const uchar *src1_data,
|
||||
size_t src1_step,
|
||||
const uchar *src2_data,
|
||||
size_t src2_step,
|
||||
uchar *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_mul16s(
|
||||
const short *src1_data,
|
||||
size_t src1_step,
|
||||
const short *src2_data,
|
||||
size_t src2_step,
|
||||
short *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_mul32f(
|
||||
const float *src1_data,
|
||||
size_t src1_step,
|
||||
const float *src2_data,
|
||||
size_t src2_step,
|
||||
float *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale);
|
||||
|
||||
#endif
|
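Taken together, these overrides mean ordinary OpenCV calls pick up the FastCV path automatically when the build includes this HAL; nothing changes at the API level. A hedged usage sketch with the standard OpenCV API (nothing FastCV-specific in the calling code):

#include <opencv2/core.hpp>

int main()
{
    // 8-bit single-channel source and a 256-entry lookup table.
    cv::Mat src(480, 640, CV_8UC1, cv::Scalar(42));
    cv::Mat lut(1, 256, CV_8UC1);
    for (int i = 0; i < 256; ++i)
        lut.at<uchar>(i) = static_cast<uchar>(255 - i);

    cv::Mat dst;
    // With the FastCV HAL built in, cv::LUT routes through cv_hal_lut, i.e.
    // fastcv_hal_lut, for large-enough CV_8UC1 inputs; otherwise the HAL returns
    // CV_HAL_ERROR_NOT_IMPLEMENTED and OpenCV's own implementation runs.
    cv::LUT(src, lut, dst);

    cv::Mat flipped;
    cv::flip(src, flipped, 1);  // flip around the y-axis; may hit fastcv_hal_flip
    return 0;
}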
268  3rdparty/fastcv/include/fastcv_hal_imgproc.hpp (vendored, new file)
@ -0,0 +1,268 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#ifndef OPENCV_FASTCV_HAL_IMGPROC_HPP_INCLUDED
|
||||
#define OPENCV_FASTCV_HAL_IMGPROC_HPP_INCLUDED
|
||||
|
||||
#include <opencv2/core/base.hpp>
|
||||
|
||||
#undef cv_hal_medianBlur
|
||||
#define cv_hal_medianBlur fastcv_hal_medianBlur
|
||||
#undef cv_hal_sobel
|
||||
#define cv_hal_sobel fastcv_hal_sobel
|
||||
#undef cv_hal_boxFilter
|
||||
#define cv_hal_boxFilter fastcv_hal_boxFilter
|
||||
#undef cv_hal_adaptiveThreshold
|
||||
#define cv_hal_adaptiveThreshold fastcv_hal_adaptiveThreshold
|
||||
#undef cv_hal_gaussianBlurBinomial
|
||||
#define cv_hal_gaussianBlurBinomial fastcv_hal_gaussianBlurBinomial
|
||||
#undef cv_hal_warpPerspective
|
||||
#define cv_hal_warpPerspective fastcv_hal_warpPerspective
|
||||
#undef cv_hal_pyrdown
|
||||
#define cv_hal_pyrdown fastcv_hal_pyrdown
|
||||
#undef cv_hal_cvtBGRtoHSV
|
||||
#define cv_hal_cvtBGRtoHSV fastcv_hal_cvtBGRtoHSV
|
||||
#undef cv_hal_cvtBGRtoYUVApprox
|
||||
#define cv_hal_cvtBGRtoYUVApprox fastcv_hal_cvtBGRtoYUVApprox
|
||||
#undef cv_hal_canny
|
||||
#define cv_hal_canny fastcv_hal_canny
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Calculate medianBlur filter
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param width Source image width
|
||||
/// @param height Source image height
|
||||
/// @param depth Depths of source and destination image
|
||||
/// @param cn Number of channels
|
||||
/// @param ksize Size of kernel
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_medianBlur(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int depth,
|
||||
int cn,
|
||||
int ksize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Computes Sobel derivatives
|
||||
///
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param width Source image width
|
||||
/// @param height Source image height
|
||||
/// @param src_depth Depth of source image
|
||||
/// @param dst_depth Depths of destination image
|
||||
/// @param cn Number of channels
|
||||
/// @param margin_left Left margins for source image
|
||||
/// @param margin_top Top margins for source image
|
||||
/// @param margin_right Right margins for source image
|
||||
/// @param margin_bottom Bottom margins for source image
|
||||
/// @param dx orders of the derivative x
|
||||
/// @param dy orders of the derivative y
|
||||
/// @param ksize Size of kernel
|
||||
/// @param scale Scale factor for the computed derivative values
|
||||
/// @param delta Delta value that is added to the results prior to storing them in dst
|
||||
/// @param border_type Border type
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_sobel(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int src_depth,
|
||||
int dst_depth,
|
||||
int cn,
|
||||
int margin_left,
|
||||
int margin_top,
|
||||
int margin_right,
|
||||
int margin_bottom,
|
||||
int dx,
|
||||
int dy,
|
||||
int ksize,
|
||||
double scale,
|
||||
double delta,
|
||||
int border_type);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int fastcv_hal_boxFilter(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int src_depth,
|
||||
int dst_depth,
|
||||
int cn,
|
||||
int margin_left,
|
||||
int margin_top,
|
||||
int margin_right,
|
||||
int margin_bottom,
|
||||
size_t ksize_width,
|
||||
size_t ksize_height,
|
||||
int anchor_x,
|
||||
int anchor_y,
|
||||
bool normalize,
|
||||
int border_type);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_adaptiveThreshold(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double maxValue,
|
||||
int adaptiveMethod,
|
||||
int thresholdType,
|
||||
int blockSize,
|
||||
double C);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Blurs an image using a Gaussian filter.
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param width Source image width
|
||||
/// @param height Source image height
|
||||
/// @param depth Depth of source and destination image
|
||||
/// @param cn Number of channels
|
||||
/// @param margin_left Left margins for source image
|
||||
/// @param margin_top Top margins for source image
|
||||
/// @param margin_right Right margins for source image
|
||||
/// @param margin_bottom Bottom margins for source image
|
||||
/// @param ksize Kernel size
|
||||
/// @param border_type Border type
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_gaussianBlurBinomial(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int depth,
|
||||
int cn,
|
||||
size_t margin_left,
|
||||
size_t margin_top,
|
||||
size_t margin_right,
|
||||
size_t margin_bottom,
|
||||
size_t ksize,
|
||||
int border_type);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Applies a perspective transformation to an image.
|
||||
///
|
||||
/// @param src_type Source and destination image type
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param src_width Source image width
|
||||
/// @param src_height Source image height
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param dst_width Destination image width
|
||||
/// @param dst_height Destination image height
|
||||
/// @param M 3x3 matrix with transform coefficients
|
||||
/// @param interpolation Interpolation mode (CV_HAL_INTER_NEAREST, ...)
|
||||
/// @param border_type Border processing mode (CV_HAL_BORDER_REFLECT, ...)
|
||||
/// @param border_value Values to use for CV_HAL_BORDER_CONSTANT mode
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_warpPerspective(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int dst_width,
|
||||
int dst_height,
|
||||
const double M[9],
|
||||
int interpolation,
|
||||
int border_type,
|
||||
const double border_value[4]);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_pyrdown(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int dst_width,
|
||||
int dst_height,
|
||||
int depth,
|
||||
int cn,
|
||||
int border_type);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_cvtBGRtoHSV(
|
||||
const uchar * src_data,
|
||||
size_t src_step,
|
||||
uchar * dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int depth,
|
||||
int scn,
|
||||
bool swapBlue,
|
||||
bool isFullRange,
|
||||
bool isHSV);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_cvtBGRtoYUVApprox(
|
||||
const uchar * src_data,
|
||||
size_t src_step,
|
||||
uchar * dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int depth,
|
||||
int scn,
|
||||
bool swapBlue,
|
||||
bool isCbCr);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Canny edge detector
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param width Source image width
|
||||
/// @param height Source image height
|
||||
/// @param cn Number of channels
|
||||
/// @param lowThreshold low threshold value
/// @param highThreshold high threshold value
/// @param ksize Kernel size for the Sobel operator
/// @param L2gradient Flag indicating use of the L2 or L1 norm
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_canny(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int cn,
|
||||
double lowThreshold,
|
||||
double highThreshold,
|
||||
int ksize,
|
||||
bool L2gradient);
|
||||
|
||||
#endif
|
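As with the core header, these imgproc overrides are transparent to callers: the FastCV kernels are tried first and OpenCV falls back whenever a parameter combination is not supported. A small hedged example using the plain OpenCV API:

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat gray(480, 640, CV_8UC1, cv::Scalar(128));

    cv::Mat blurred, edges;
    // cv_hal_medianBlur / cv_hal_canny resolve to the fastcv_hal_* functions when
    // this HAL is compiled in; unsupported cases fall back to OpenCV's own code.
    cv::medianBlur(gray, blurred, 5);
    cv::Canny(blurred, edges, 50.0, 150.0, 3, false);
    return 0;
}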
84  3rdparty/fastcv/include/fastcv_hal_utils.hpp (vendored, new file)
@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#ifndef OPENCV_FASTCV_HAL_UTILS_HPP_INCLUDED
|
||||
#define OPENCV_FASTCV_HAL_UTILS_HPP_INCLUDED
|
||||
|
||||
#include "fastcv.h"
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
|
||||
#define INITIALIZATION_CHECK \
|
||||
{ \
|
||||
if (!FastCvContext::getContext().isInitialized) \
|
||||
{ \
|
||||
return CV_HAL_ERROR_UNKNOWN; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CV_HAL_RETURN(status, func) \
|
||||
{ \
|
||||
if( status == FASTCV_SUCCESS ) \
|
||||
{ \
|
||||
CV_LOG_DEBUG(NULL, "FastCV HAL for "<<#func<<" run successfully!"); \
|
||||
return CV_HAL_ERROR_OK; \
|
||||
} \
|
||||
else if(status == FASTCV_EBADPARAM || status == FASTCV_EUNALIGNPARAM || \
|
||||
status == FASTCV_EUNSUPPORTED || status == FASTCV_EHWQDSP || \
|
||||
status == FASTCV_EHWGPU) \
|
||||
{ \
|
||||
CV_LOG_DEBUG(NULL, "FastCV status:"<<getFastCVErrorString(status) \
|
||||
<<", Switching to default OpenCV solution!"); \
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
CV_LOG_ERROR(NULL,"FastCV error:"<<getFastCVErrorString(status)); \
|
||||
return CV_HAL_ERROR_UNKNOWN; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CV_HAL_RETURN_NOT_IMPLEMENTED(reason) \
|
||||
{ \
|
||||
CV_LOG_DEBUG(NULL,"Switching to default OpenCV\nInfo: "<<reason); \
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED; \
|
||||
}
|
||||
|
||||
#define FCV_KernelSize_SHIFT 3
|
||||
#define FCV_MAKETYPE(ksize,depth) ((ksize<<FCV_KernelSize_SHIFT) + depth)
|
||||
#define FCV_CMP_EQ(val1,val2) (fabs(val1 - val2) < FLT_EPSILON)
|
||||
|
||||
const char* getFastCVErrorString(int status);
|
||||
const char* borderToString(int border);
|
||||
const char* interpolationToString(int interpolation);
|
||||
|
||||
struct FastCvContext
|
||||
{
|
||||
public:
|
||||
// initialize at first call
|
||||
// Defines a static local variable context. Variable is created only once.
|
||||
static FastCvContext& getContext()
|
||||
{
|
||||
static FastCvContext context;
|
||||
return context;
|
||||
}
|
||||
|
||||
FastCvContext()
|
||||
{
|
||||
if (fcvSetOperationMode(FASTCV_OP_CPU_PERFORMANCE) != 0)
|
||||
{
|
||||
CV_LOG_WARNING(NULL, "Failed to switch FastCV operation mode");
|
||||
isInitialized = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_LOG_INFO(NULL, "FastCV Operation Mode Switched");
|
||||
isInitialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
bool isInitialized;
|
||||
};
|
||||
|
||||
#endif
|
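The three macros above define the common control flow for every wrapper in this HAL: check that FastCV initialized, run the FastCV kernel, then translate its status into a HAL return code. A minimal hedged sketch of a wrapper built from them (hypothetical function, shown only to illustrate the macro usage; the real wrappers live in the .cpp files below):

#include "fastcv_hal_utils.hpp"   // macros, fcvStatus
#include <opencv2/core/base.hpp>  // CV_HAL_ERROR_* codes, uchar

// Hypothetical wrapper illustrating the pattern used throughout this HAL.
int fastcv_hal_example_copy(const uchar* src, size_t src_step,
                            uchar* dst, size_t dst_step,
                            int width, int height)
{
    // Decline early for cases FastCV does not handle.
    if (width <= 0 || height <= 0)
        CV_HAL_RETURN_NOT_IMPLEMENTED("empty input, switching to default OpenCV");

    // Returns CV_HAL_ERROR_UNKNOWN if the FastCV context never initialized.
    INITIALIZATION_CHECK;

    fcvStatus status = FASTCV_SUCCESS;
    // ... call the appropriate fcv* kernel here and store its status ...
    (void)src; (void)src_step; (void)dst; (void)dst_step;

    // Maps FASTCV_SUCCESS to CV_HAL_ERROR_OK, recoverable errors to
    // CV_HAL_ERROR_NOT_IMPLEMENTED (OpenCV fallback), everything else to UNKNOWN.
    CV_HAL_RETURN(status, hal_example_copy);
}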
574  3rdparty/fastcv/src/fastcv_hal_core.cpp (vendored, new file)
@ -0,0 +1,574 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "fastcv_hal_core.hpp"
|
||||
#include "fastcv_hal_utils.hpp"
|
||||
#include <opencv2/core/core.hpp>
|
||||
#include <opencv2/core/base.hpp>
|
||||
|
||||
|
||||
class ParallelTableLookup : public cv::ParallelLoopBody
|
||||
{
|
||||
public:
|
||||
|
||||
ParallelTableLookup(const uchar* src_data_, int width_, size_t src_step_, const uchar* lut_data_, uchar* dst_data_, size_t dst_step_) :
|
||||
cv::ParallelLoopBody(), src_data(src_data_), width(width_), src_step(src_step_), lut_data(lut_data_), dst_data(dst_data_), dst_step(dst_step_)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void operator()(const cv::Range& range) const CV_OVERRIDE
|
||||
{
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
for (int y = range.start; y < range.end; y++) {
|
||||
status = fcvTableLookupu8((uint8_t*)src_data + y * src_step, width, 1, src_step, (uint8_t*)lut_data, (uint8_t*)dst_data + y * dst_step, dst_step);
|
||||
if(status != FASTCV_SUCCESS)
|
||||
CV_LOG_ERROR(NULL,"FastCV error:"<<getFastCVErrorString(status));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const uchar* src_data;
|
||||
int width;
|
||||
size_t src_step;
|
||||
const uchar* lut_data;
|
||||
uchar* dst_data;
|
||||
size_t dst_step;
|
||||
};
|
||||
|
||||
int fastcv_hal_lut(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
size_t src_type,
|
||||
const uchar* lut_data,
|
||||
size_t lut_channel_size,
|
||||
size_t lut_channels,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height)
|
||||
{
|
||||
if((width*height)<=(320*240))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution!");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
fcvStatus status;
|
||||
if (src_type == CV_8UC1 && lut_channels == 1 && lut_channel_size == 1)
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height),
|
||||
ParallelTableLookup(src_data, width, src_step, lut_data, dst_data, dst_step));
|
||||
status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_lut);
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Multi-channel input is not supported");
|
||||
}
|
||||
}
|
||||
|
||||
int fastcv_hal_normHammingDiff8u(
|
||||
const uchar* a,
|
||||
const uchar* b,
|
||||
int n,
|
||||
int cellSize,
|
||||
int* result)
|
||||
{
|
||||
fcvStatus status;
|
||||
|
||||
if (cellSize != 1)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("NORM_HAMMING2 cellSize:%d is not supported", cellSize));
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
uint32_t dist = 0;
|
||||
|
||||
dist = fcvHammingDistanceu8((uint8_t*)a, (uint8_t*)b, n);
|
||||
|
||||
*result = dist;
|
||||
status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_normHammingDiff8u);
|
||||
}
|
||||
|
||||
int fastcv_hal_mul8u16u(
|
||||
const uchar* src1_data,
|
||||
size_t src1_step,
|
||||
const uchar* src2_data,
|
||||
size_t src2_step,
|
||||
ushort* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale)
|
||||
{
|
||||
if(scale != 1.0)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Scale factor not supported");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
|
||||
if (src1_step < (size_t)width && src2_step < (size_t)width)
|
||||
{
|
||||
src1_step = width*sizeof(uchar);
|
||||
src2_step = width*sizeof(uchar);
|
||||
dst_step = width*sizeof(ushort);
|
||||
}
|
||||
|
||||
status = fcvElementMultiplyu8u16_v2(src1_data, width, height, src1_step,
|
||||
src2_data, src2_step, dst_data, dst_step);
|
||||
|
||||
CV_HAL_RETURN(status,hal_multiply);
|
||||
}
|
||||
|
||||
int fastcv_hal_sub8u32f(
|
||||
const uchar* src1_data,
|
||||
size_t src1_step,
|
||||
const uchar* src2_data,
|
||||
size_t src2_step,
|
||||
float* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height)
|
||||
{
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
|
||||
if (src1_step < (size_t)width && src2_step < (size_t)width)
|
||||
{
|
||||
src1_step = width*sizeof(uchar);
|
||||
src2_step = width*sizeof(uchar);
|
||||
dst_step = width*sizeof(float);
|
||||
}
|
||||
|
||||
status = fcvImageDiffu8f32_v2(src1_data, src2_data, width, height, src1_step,
|
||||
src2_step, dst_data, dst_step);
|
||||
|
||||
CV_HAL_RETURN(status,hal_subtract);
|
||||
|
||||
}
|
||||
|
||||
int fastcv_hal_transpose2d(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
int element_size)
|
||||
{
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
if (src_data == dst_data)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("In-place not supported");
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
|
||||
switch (element_size)
|
||||
{
|
||||
case 1:
|
||||
status = fcvTransposeu8_v2(src_data, src_width, src_height, src_step,
|
||||
dst_data, dst_step);
|
||||
break;
|
||||
case 2:
|
||||
status = fcvTransposeu16_v2((const uint16_t*)src_data, src_width, src_height,
|
||||
src_step, (uint16_t*)dst_data, dst_step);
|
||||
break;
|
||||
case 4:
|
||||
status = fcvTransposef32_v2((const float32_t*)src_data, src_width, src_height,
|
||||
src_step, (float32_t*)dst_data, dst_step);
|
||||
break;
|
||||
default:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("srcType not supported");
|
||||
}
|
||||
|
||||
CV_HAL_RETURN(status,hal_transpose);
|
||||
}
|
||||
|
||||
int fastcv_hal_meanStdDev(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int width,
|
||||
int height,
|
||||
int src_type,
|
||||
double* mean_val,
|
||||
double* stddev_val,
|
||||
uchar* mask,
|
||||
size_t mask_step)
|
||||
{
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
CV_UNUSED(mask_step);
|
||||
|
||||
if(src_type != CV_8UC1)
|
||||
{
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("src type not supported");
|
||||
}
|
||||
else if(mask != nullptr)
|
||||
{
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("mask not supported");
|
||||
}
|
||||
else if(mean_val == nullptr && stddev_val == nullptr)
|
||||
{
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("null ptr for mean and stddev");
|
||||
}
|
||||
|
||||
float32_t mean, variance;
|
||||
|
||||
fcvStatus status = fcvImageIntensityStats_v2(src_data, src_step, 0, 0, width, height,
|
||||
&mean, &variance, FASTCV_BIASED_VARIANCE_ESTIMATOR);
|
||||
|
||||
if(mean_val != nullptr)
|
||||
*mean_val = mean;
|
||||
if(stddev_val != nullptr)
|
||||
*stddev_val = std::sqrt(variance);
|
||||
|
||||
CV_HAL_RETURN(status,hal_meanStdDev);
|
||||
}
|
||||
|
||||
int fastcv_hal_flip(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int flip_mode)
|
||||
{
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
if(src_type!=CV_8UC1 && src_type!=CV_16UC1 && src_type!=CV_8UC3)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Data type is not supported, Switching to default OpenCV solution!");
|
||||
|
||||
if((src_width*src_height)<=(640*480))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution!");
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
fcvFlipDir dir;
|
||||
|
||||
switch (flip_mode)
|
||||
{
|
||||
//Flip around X-Axis: Vertical Flip or FLIP_ROWS
|
||||
case 0:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution due to low perf!");
|
||||
dir = FASTCV_FLIP_VERT;
|
||||
break;
|
||||
|
||||
//Flip around Y-Axis: Horizontal Flip or FLIP_COLS
|
||||
case 1:
|
||||
dir = FASTCV_FLIP_HORIZ;
|
||||
break;
|
||||
|
||||
//Flip around both X and Y-Axis or FLIP_BOTH
|
||||
case -1:
|
||||
dir = FASTCV_FLIP_BOTH;
|
||||
break;
|
||||
default:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Invalid flip_mode, Switching to default OpenCV solution!");
|
||||
}
|
||||
|
||||
if(src_type==CV_8UC1)
|
||||
fcvFlipu8(src_data, src_width, src_height, src_step, dst_data, dst_step, dir);
|
||||
else if(src_type==CV_16UC1)
|
||||
fcvFlipu16((uint16_t*)src_data, src_width, src_height, src_step, (uint16_t*)dst_data, dst_step, dir);
|
||||
else if(src_type==CV_8UC3)
|
||||
status = fcvFlipRGB888u8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data, dst_step, dir);
|
||||
else
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Data type:%d is not supported, Switching to default OpenCV solution!", src_type));
|
||||
|
||||
CV_HAL_RETURN(status, hal_flip);
|
||||
}
|
||||
|
||||
int fastcv_hal_rotate(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int angle)
|
||||
{
|
||||
if((src_width*src_height)<(120*80))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution for lower resolution!");
|
||||
|
||||
fcvStatus status;
|
||||
fcvRotateDegree degree;
|
||||
|
||||
if (src_type != CV_8UC1 && src_type != CV_8UC2)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("src_type:%d is not supported", src_type));
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
switch (angle)
|
||||
{
|
||||
case 90:
|
||||
degree = FASTCV_ROTATE_90;
|
||||
break;
|
||||
case 180:
|
||||
degree = FASTCV_ROTATE_180;
|
||||
break;
|
||||
case 270:
|
||||
degree = FASTCV_ROTATE_270;
|
||||
break;
|
||||
default:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Rotation angle:%d is not supported", angle));
|
||||
}
|
||||
|
||||
switch(src_type)
|
||||
{
|
||||
case CV_8UC1:
|
||||
status = fcvRotateImageu8(src_data, src_width, src_height, src_step, dst_data, dst_step, degree);
|
||||
break;
|
||||
case CV_8UC2:
|
||||
status = fcvRotateImageInterleavedu8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data,
|
||||
dst_step, degree);
|
||||
break;
|
||||
default:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("src_type:%d is not supported", src_type));
|
||||
}
|
||||
CV_HAL_RETURN(status, hal_rotate);
|
||||
}
|
||||
|
||||
int fastcv_hal_addWeighted8u(
|
||||
const uchar* src1_data,
|
||||
size_t src1_step,
|
||||
const uchar* src2_data,
|
||||
size_t src2_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
const double scalars[3])
|
||||
{
|
||||
if( (scalars[0] < -128.0f) || (scalars[0] >= 128.0f) ||
|
||||
(scalars[1] < -128.0f) || (scalars[1] >= 128.0f) ||
|
||||
(scalars[2] < -(1<<23))|| (scalars[2] >= 1<<23))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(
|
||||
cv::format("Alpha:%f,Beta:%f,Gamma:%f is not supported because it's too large or too small\n",
|
||||
scalars[0],scalars[1],scalars[2]));
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
|
||||
if (height == 1)
|
||||
{
|
||||
src1_step = width*sizeof(uchar);
|
||||
src2_step = width*sizeof(uchar);
|
||||
dst_step = width*sizeof(uchar);
|
||||
|
||||
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
|
||||
int rangeWidth = range.end - range.start;
|
||||
const uint8_t *src1 = src1_data + range.start;
|
||||
const uint8_t *src2 = src2_data + range.start;
|
||||
uint8_t *dst = dst_data + range.start;
|
||||
fcvAddWeightedu8_v2(src1, rangeWidth, height, src1_step, src2, src2_step,
|
||||
scalars[0], scalars[1], scalars[2], dst, dst_step);
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
|
||||
int rangeHeight = range.end - range.start;
|
||||
const uint8_t *src1 = src1_data + range.start * src1_step;
|
||||
const uint8_t *src2 = src2_data + range.start * src2_step;
|
||||
uint8_t *dst = dst_data + range.start * dst_step;
|
||||
fcvAddWeightedu8_v2(src1, width, rangeHeight, src1_step, src2, src2_step,
|
||||
scalars[0], scalars[1], scalars[2], dst, dst_step);
|
||||
});
|
||||
}
|
||||
|
||||
CV_HAL_RETURN(status, hal_addWeighted8u_v2);
|
||||
}
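fastcv_hal_addWeighted8u backs the standard cv::addWeighted call for 8-bit inputs, within the alpha/beta/gamma ranges checked above. A hedged usage example with the ordinary OpenCV API:

#include <opencv2/core.hpp>

int main()
{
    cv::Mat a(480, 640, CV_8UC1, cv::Scalar(100));
    cv::Mat b(480, 640, CV_8UC1, cv::Scalar(50));
    cv::Mat dst;
    // dst = 0.7*a + 0.3*b + 10; with this HAL built in and coefficients in range,
    // the call is served by fcvAddWeightedu8_v2 via fastcv_hal_addWeighted8u.
    cv::addWeighted(a, 0.7, b, 0.3, 10.0, dst);
    return 0;
}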
|
||||
|
||||
int fastcv_hal_mul8u(
|
||||
const uchar *src1_data,
|
||||
size_t src1_step,
|
||||
const uchar *src2_data,
|
||||
size_t src2_step,
|
||||
uchar *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale)
|
||||
{
|
||||
int8_t sF;
|
||||
|
||||
if(FCV_CMP_EQ(scale,1.0)) { sF = 0; }
|
||||
else if(scale > 1.0)
|
||||
{
|
||||
if(FCV_CMP_EQ(scale,2.0)) { sF = -1; }
|
||||
else if(FCV_CMP_EQ(scale,4.0)) { sF = -2; }
|
||||
else if(FCV_CMP_EQ(scale,8.0)) { sF = -3; }
|
||||
else if(FCV_CMP_EQ(scale,16.0)) { sF = -4; }
|
||||
else if(FCV_CMP_EQ(scale,32.0)) { sF = -5; }
|
||||
else if(FCV_CMP_EQ(scale,64.0)) { sF = -6; }
|
||||
else if(FCV_CMP_EQ(scale,128.0)) { sF = -7; }
|
||||
else if(FCV_CMP_EQ(scale,256.0)) { sF = -8; }
|
||||
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
}
|
||||
else if(scale > 0 && scale < 1.0)
|
||||
{
|
||||
if(FCV_CMP_EQ(scale,1/2.0)) { sF = 1; }
|
||||
else if(FCV_CMP_EQ(scale,1/4.0)) { sF = 2; }
|
||||
else if(FCV_CMP_EQ(scale,1/8.0)) { sF = 3; }
|
||||
else if(FCV_CMP_EQ(scale,1/16.0)) { sF = 4; }
|
||||
else if(FCV_CMP_EQ(scale,1/32.0)) { sF = 5; }
|
||||
else if(FCV_CMP_EQ(scale,1/64.0)) { sF = 6; }
|
||||
else if(FCV_CMP_EQ(scale,1/128.0)) { sF = 7; }
|
||||
else if(FCV_CMP_EQ(scale,1/256.0)) { sF = 8; }
|
||||
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
}
|
||||
else
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
int nStripes = cv::getNumThreads();
|
||||
|
||||
if(height == 1)
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
|
||||
int rangeWidth = range.end - range.start;
|
||||
const uchar* yS1 = src1_data + static_cast<size_t>(range.start);
|
||||
const uchar* yS2 = src2_data + static_cast<size_t>(range.start);
|
||||
uchar* yD = dst_data + static_cast<size_t>(range.start);
|
||||
fcvElementMultiplyu8(yS1, rangeWidth, 1, 0, yS2, 0, sF,
|
||||
FASTCV_CONVERT_POLICY_SATURATE, yD, 0);
|
||||
}, nStripes);
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
|
||||
int rangeHeight = range.end - range.start;
|
||||
const uchar* yS1 = src1_data + static_cast<size_t>(range.start)*src1_step;
|
||||
const uchar* yS2 = src2_data + static_cast<size_t>(range.start)*src2_step;
|
||||
uchar* yD = dst_data + static_cast<size_t>(range.start)*dst_step;
|
||||
fcvElementMultiplyu8(yS1, width, rangeHeight, src1_step, yS2, src2_step,
|
||||
sF, FASTCV_CONVERT_POLICY_SATURATE, yD, dst_step);
|
||||
}, nStripes);
|
||||
}
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_mul8u);
|
||||
}
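The ladder of FCV_CMP_EQ checks above encodes the requested multiplication scale as an int8 shift sF with effective scale 2^(-sF), so only exact powers of two between 1/256 and 256 are accepted and everything else falls back to OpenCV. A hedged sketch of the same mapping written directly (hypothetical helper, equivalent in spirit to the if/else chain):

#include <cmath>
#include <cstdint>

// Returns true and sets sF when 'scale' is an exact power of two in [1/256, 256],
// so that scale == std::ldexp(1.0, -sF); mirrors the FCV_CMP_EQ ladder above.
bool scaleToShift(double scale, int8_t& sF)
{
    for (int s = -8; s <= 8; ++s)
    {
        if (std::fabs(scale - std::ldexp(1.0, -s)) < 1e-12)
        {
            sF = static_cast<int8_t>(s);
            return true;
        }
    }
    return false;  // caller should fall back to the generic OpenCV path
}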
|
||||
|
||||
int fastcv_hal_mul16s(
|
||||
const short *src1_data,
|
||||
size_t src1_step,
|
||||
const short *src2_data,
|
||||
size_t src2_step,
|
||||
short *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale)
|
||||
{
|
||||
int8_t sF;
|
||||
|
||||
if(FCV_CMP_EQ(scale,1.0)) { sF = 0; }
|
||||
else if(scale > 1.0)
|
||||
{
|
||||
if(FCV_CMP_EQ(scale,2.0)) { sF = -1; }
|
||||
else if(FCV_CMP_EQ(scale,4.0)) { sF = -2; }
|
||||
else if(FCV_CMP_EQ(scale,8.0)) { sF = -3; }
|
||||
else if(FCV_CMP_EQ(scale,16.0)) { sF = -4; }
|
||||
else if(FCV_CMP_EQ(scale,32.0)) { sF = -5; }
|
||||
else if(FCV_CMP_EQ(scale,64.0)) { sF = -6; }
|
||||
else if(FCV_CMP_EQ(scale,128.0)) { sF = -7; }
|
||||
else if(FCV_CMP_EQ(scale,256.0)) { sF = -8; }
|
||||
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
}
|
||||
else if(scale > 0 && scale < 1.0)
|
||||
{
|
||||
if(FCV_CMP_EQ(scale,1/2.0)) { sF = 1; }
|
||||
else if(FCV_CMP_EQ(scale,1/4.0)) { sF = 2; }
|
||||
else if(FCV_CMP_EQ(scale,1/8.0)) { sF = 3; }
|
||||
else if(FCV_CMP_EQ(scale,1/16.0)) { sF = 4; }
|
||||
else if(FCV_CMP_EQ(scale,1/32.0)) { sF = 5; }
|
||||
else if(FCV_CMP_EQ(scale,1/64.0)) { sF = 6; }
|
||||
else if(FCV_CMP_EQ(scale,1/128.0)) { sF = 7; }
|
||||
else if(FCV_CMP_EQ(scale,1/256.0)) { sF = 8; }
|
||||
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
}
|
||||
else
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
int nStripes = cv::getNumThreads();
|
||||
|
||||
if(height == 1)
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
|
||||
int rangeWidth = range.end - range.start;
|
||||
const short* yS1 = src1_data + static_cast<size_t>(range.start);
|
||||
const short* yS2 = src2_data + static_cast<size_t>(range.start);
|
||||
short* yD = dst_data + static_cast<size_t>(range.start);
|
||||
fcvElementMultiplys16(yS1, rangeWidth, 1, 0, yS2, 0, sF,
|
||||
FASTCV_CONVERT_POLICY_SATURATE, yD, 0);
|
||||
}, nStripes);
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
|
||||
int rangeHeight = range.end - range.start;
|
||||
const short* yS1 = src1_data + static_cast<size_t>(range.start) * (src1_step/sizeof(short));
|
||||
const short* yS2 = src2_data + static_cast<size_t>(range.start) * (src2_step/sizeof(short));
|
||||
short* yD = dst_data + static_cast<size_t>(range.start) * (dst_step/sizeof(short));
|
||||
fcvElementMultiplys16(yS1, width, rangeHeight, src1_step, yS2, src2_step,
|
||||
sF, FASTCV_CONVERT_POLICY_SATURATE, yD, dst_step);
|
||||
}, nStripes);
|
||||
}
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_mul16s);
|
||||
}
|
||||
|
||||
int fastcv_hal_mul32f(
|
||||
const float *src1_data,
|
||||
size_t src1_step,
|
||||
const float *src2_data,
|
||||
size_t src2_step,
|
||||
float *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale)
|
||||
{
|
||||
if(!FCV_CMP_EQ(scale,1.0))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
int nStripes = cv::getNumThreads();
|
||||
|
||||
if(height == 1)
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
|
||||
int rangeWidth = range.end - range.start;
|
||||
const float* yS1 = src1_data + static_cast<size_t>(range.start);
|
||||
const float* yS2 = src2_data + static_cast<size_t>(range.start);
|
||||
float* yD = dst_data + static_cast<size_t>(range.start);
|
||||
fcvElementMultiplyf32(yS1, rangeWidth, 1, 0, yS2, 0, yD, 0);
|
||||
}, nStripes);
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
|
||||
int rangeHeight = range.end - range.start;
|
||||
const float* yS1 = src1_data + static_cast<size_t>(range.start) * (src1_step/sizeof(float));
|
||||
const float* yS2 = src2_data + static_cast<size_t>(range.start) * (src2_step/sizeof(float));
|
||||
float* yD = dst_data + static_cast<size_t>(range.start) * (dst_step/sizeof(float));
|
||||
fcvElementMultiplyf32(yS1, width, rangeHeight, src1_step,
|
||||
yS2, src2_step, yD, dst_step);
|
||||
}, nStripes);
|
||||
}
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_mul32f);
|
||||
}
|
1050  3rdparty/fastcv/src/fastcv_hal_imgproc.cpp (vendored, new file)
File diff suppressed because it is too large
56  3rdparty/fastcv/src/fastcv_hal_utils.cpp (vendored, new file)
@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "fastcv_hal_utils.hpp"
|
||||
|
||||
const char* getFastCVErrorString(int status)
|
||||
{
|
||||
switch(status)
|
||||
{
|
||||
case FASTCV_SUCCESS: return "Successful";
|
||||
case FASTCV_EFAIL: return "General failure";
|
||||
case FASTCV_EUNALIGNPARAM: return "Unaligned pointer parameter";
|
||||
case FASTCV_EBADPARAM: return "Bad parameters";
|
||||
case FASTCV_EINVALSTATE: return "Called at invalid state";
|
||||
case FASTCV_ENORES: return "Insufficient resources, memory, thread, etc";
|
||||
case FASTCV_EUNSUPPORTED: return "Unsupported feature";
|
||||
case FASTCV_EHWQDSP: return "Hardware QDSP failed to respond";
|
||||
case FASTCV_EHWGPU: return "Hardware GPU failed to respond";
|
||||
default: return "Unknown FastCV Error";
|
||||
}
|
||||
}
|
||||
|
||||
const char* borderToString(int border)
|
||||
{
|
||||
switch (border)
|
||||
{
|
||||
case 0: return "BORDER_CONSTANT";
|
||||
case 1: return "BORDER_REPLICATE";
|
||||
case 2: return "BORDER_REFLECT";
|
||||
case 3: return "BORDER_WRAP";
|
||||
case 4: return "BORDER_REFLECT_101";
|
||||
case 5: return "BORDER_TRANSPARENT";
|
||||
default: return "Unknown border type";
|
||||
}
|
||||
}
|
||||
|
||||
const char* interpolationToString(int interpolation)
|
||||
{
|
||||
switch (interpolation)
|
||||
{
|
||||
case 0: return "INTER_NEAREST";
|
||||
case 1: return "INTER_LINEAR";
|
||||
case 2: return "INTER_CUBIC";
|
||||
case 3: return "INTER_AREA";
|
||||
case 4: return "INTER_LANCZOS4";
|
||||
case 5: return "INTER_LINEAR_EXACT";
|
||||
case 6: return "INTER_NEAREST_EXACT";
|
||||
case 7: return "INTER_MAX";
|
||||
case 8: return "WARP_FILL_OUTLIERS";
|
||||
case 16: return "WARP_INVERSE_MAP";
|
||||
case 32: return "WARP_RELATIVE_MAP";
|
||||
default: return "Unknown interpolation type";
|
||||
}
|
||||
}
|
10  3rdparty/ffmpeg/ffmpeg.cmake (vendored)
@@ -1,8 +1,8 @@
# Binaries branch name: ffmpeg/4.x_20240522
# Binaries were created for OpenCV: 8393885a39dac1e650bf5d0aaff84c04ad8bcdd3
ocv_update(FFMPEG_BINARIES_COMMIT "394dca6ceb3085c979415e6385996b6570e94153")
ocv_update(FFMPEG_FILE_HASH_BIN32 "bdfbd1efb295f3e54c07d2cb7a843bf9")
ocv_update(FFMPEG_FILE_HASH_BIN64 "bfef029900f788480a363d6dc05c4f0e")
# Binaries branch name: ffmpeg/4.x_20241226
# Binaries were created for OpenCV: 09892c9d1706f40342bda0bc404580f63492d9f8
ocv_update(FFMPEG_BINARIES_COMMIT "d63d7c154c57242bf2283be61166be2bd30ec47e")
ocv_update(FFMPEG_FILE_HASH_BIN32 "642b94d032a8292b07550126934173f6")
ocv_update(FFMPEG_FILE_HASH_BIN64 "a8c3560c8f20e1ae465bef81580fa92c")
ocv_update(FFMPEG_FILE_HASH_CMAKE "8862c87496e2e8c375965e1277dee1c7")

function(download_win_ffmpeg script_var)
7  3rdparty/hal_rvv/hal_rvv.hpp (vendored)
@@ -19,4 +19,9 @@
#include "version/hal_rvv_071.hpp"
#endif

#endif
#if defined(__riscv_v) && __riscv_v == 1000000
#include "hal_rvv_1p0/merge.hpp" // core
#include "hal_rvv_1p0/mean.hpp" // core
#endif

#endif
228  3rdparty/hal_rvv/hal_rvv_1p0/mean.hpp (vendored, new file)
@ -0,0 +1,228 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
#ifndef OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
namespace cv { namespace cv_hal_rvv {
|
||||
|
||||
#undef cv_hal_meanStdDev
|
||||
#define cv_hal_meanStdDev cv::cv_hal_rvv::meanStdDev
|
||||
|
||||
inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
|
||||
inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
|
||||
inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
|
||||
|
||||
inline int meanStdDev(const uchar* src_data, size_t src_step, int width, int height,
|
||||
int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
|
||||
switch (src_type)
|
||||
{
|
||||
case CV_8UC1:
|
||||
return meanStdDev_8UC1(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
|
||||
case CV_8UC4:
|
||||
return meanStdDev_8UC4(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
|
||||
case CV_32FC1:
|
||||
return meanStdDev_32FC1(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
|
||||
default:
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
}
|
||||
}
|
||||
|
||||
inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
|
||||
int nz = 0;
|
||||
int vlmax = __riscv_vsetvlmax_e64m8();
|
||||
vuint64m8_t vec_sum = __riscv_vmv_v_x_u64m8(0, vlmax);
|
||||
vuint64m8_t vec_sqsum = __riscv_vmv_v_x_u64m8(0, vlmax);
|
||||
if (mask) {
|
||||
for (int i = 0; i < height; ++i) {
|
||||
const uchar* src_row = src_data + i * src_step;
|
||||
const uchar* mask_row = mask + i * mask_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width; j += vl) {
|
||||
vl = __riscv_vsetvl_e8m1(width - j);
|
||||
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
|
||||
auto vmask_u8 = __riscv_vle8_v_u8m1(mask_row+j, vl);
|
||||
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
|
||||
auto vmask = __riscv_vmseq_vx_u8m1_b8(vmask_u8, 1, vl);
|
||||
vec_sum = __riscv_vwaddu_wv_u64m8_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
nz += __riscv_vcpop_m_b8(vmask, vl);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < height; i++) {
|
||||
const uchar* src_row = src_data + i * src_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width; j += vl) {
|
||||
vl = __riscv_vsetvl_e8m1(width - j);
|
||||
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
|
||||
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
|
||||
vec_sum = __riscv_vwaddu_wv_u64m8_tu(vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
}
|
||||
}
|
||||
nz = height * width;
|
||||
}
|
||||
if (nz == 0) {
|
||||
if (mean_val) *mean_val = 0.0;
|
||||
if (stddev_val) *stddev_val = 0.0;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
auto zero = __riscv_vmv_s_x_u64m1(0, vlmax);
|
||||
auto vec_red = __riscv_vmv_v_x_u64m1(0, vlmax);
|
||||
auto vec_reddev = __riscv_vmv_v_x_u64m1(0, vlmax);
|
||||
vec_red = __riscv_vredsum(vec_sum, zero, vlmax);
|
||||
vec_reddev = __riscv_vredsum(vec_sqsum, zero, vlmax);
|
||||
double sum = __riscv_vmv_x(vec_red);
|
||||
double mean = sum / nz;
|
||||
if (mean_val) {
|
||||
*mean_val = mean;
|
||||
}
|
||||
if (stddev_val) {
|
||||
double sqsum = __riscv_vmv_x(vec_reddev);
|
||||
double variance = std::max((sqsum / nz) - (mean * mean), 0.0);
|
||||
double stddev = std::sqrt(variance);
|
||||
*stddev_val = stddev;
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
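As a plain-C++ cross-check for the vectorized path above, a scalar reference that computes the same quantities: a running sum and squared sum, then mean and standard deviation via var = E[x^2] - mean^2. The function name and the unmasked-only scope are assumptions for the sketch.

// Scalar reference for the unmasked 8UC1 case; illustrative only.
#include <algorithm>
#include <cmath>
#include <cstddef>

inline void meanStdDev_8UC1_ref(const unsigned char* src, size_t step,
                                int width, int height,
                                double* mean_val, double* stddev_val)
{
    double sum = 0.0, sqsum = 0.0;
    for (int i = 0; i < height; ++i)
    {
        const unsigned char* row = src + i * step;
        for (int j = 0; j < width; ++j)
        {
            double v = row[j];
            sum += v;
            sqsum += v * v;
        }
    }
    const double n = double(width) * height;
    const double mean = sum / n;
    if (mean_val)   *mean_val = mean;
    if (stddev_val) *stddev_val = std::sqrt(std::max(sqsum / n - mean * mean, 0.0));
}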
|
||||
inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
|
||||
int nz = 0;
|
||||
int vlmax = __riscv_vsetvlmax_e64m8();
|
||||
vuint64m8_t vec_sum = __riscv_vmv_v_x_u64m8(0, vlmax);
|
||||
vuint64m8_t vec_sqsum = __riscv_vmv_v_x_u64m8(0, vlmax);
|
||||
if (mask) {
|
||||
for (int i = 0; i < height; ++i) {
|
||||
const uchar* src_row = src_data + i * src_step;
|
||||
const uchar* mask_row = mask + i * mask_step;
|
||||
int j = 0, jm = 0, vl, vlm;
|
||||
for ( ; j < width*4; j += vl, jm += vlm) {
|
||||
vl = __riscv_vsetvl_e8m1(width*4 - j);
|
||||
vlm = __riscv_vsetvl_e8mf4(width - jm);
|
||||
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
|
||||
auto vmask_u8mf4 = __riscv_vle8_v_u8mf4(mask_row + jm, vlm);
|
||||
auto vmask_u32 = __riscv_vzext_vf4(vmask_u8mf4, vlm);
|
||||
// 0 -> 0000; 1 -> 1111
|
||||
vmask_u32 = __riscv_vmul(vmask_u32, 0b00000001000000010000000100000001, vlm);
|
||||
auto vmask_u8 = __riscv_vreinterpret_u8m1(vmask_u32);
|
||||
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
|
||||
auto vmask = __riscv_vmseq_vx_u8m1_b8(vmask_u8, 1, vl);
|
||||
vec_sum = __riscv_vwaddu_wv_u64m8_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
nz += __riscv_vcpop_m_b8(vmask, vl);
|
||||
}
|
||||
}
|
||||
nz /= 4;
|
||||
} else {
|
||||
for (int i = 0; i < height; i++) {
|
||||
const uchar* src_row = src_data + i * src_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width*4; j += vl) {
|
||||
vl = __riscv_vsetvl_e8m1(width*4 - j);
|
||||
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
|
||||
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
|
||||
vec_sum = __riscv_vwaddu_wv_u64m8_tu(vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
}
|
||||
}
|
||||
nz = height * width;
|
||||
}
|
||||
if (nz == 0) {
|
||||
if (mean_val) *mean_val = 0.0;
|
||||
if (stddev_val) *stddev_val = 0.0;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
uint64_t s[256], sq[256], sum[4] = {0}, sqsum[4] = {0};
|
||||
__riscv_vse64(s, vec_sum, vlmax);
|
||||
__riscv_vse64(sq, vec_sqsum, vlmax);
|
||||
for (int i = 0; i < vlmax; ++i)
|
||||
{
|
||||
sum[i % 4] += s[i];
|
||||
sqsum[i % 4] += sq[i];
|
||||
}
|
||||
if (mean_val) {
|
||||
mean_val[0] = (double)sum[0] / nz;
|
||||
mean_val[1] = (double)sum[1] / nz;
|
||||
mean_val[2] = (double)sum[2] / nz;
|
||||
mean_val[3] = (double)sum[3] / nz;
|
||||
}
|
||||
if (stddev_val) {
|
||||
stddev_val[0] = std::sqrt(std::max(((double)sqsum[0] / nz) - (mean_val[0] * mean_val[0]), 0.0));
|
||||
stddev_val[1] = std::sqrt(std::max(((double)sqsum[1] / nz) - (mean_val[1] * mean_val[1]), 0.0));
|
||||
stddev_val[2] = std::sqrt(std::max(((double)sqsum[2] / nz) - (mean_val[2] * mean_val[2]), 0.0));
|
||||
stddev_val[3] = std::sqrt(std::max(((double)sqsum[3] / nz) - (mean_val[3] * mean_val[3]), 0.0));
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
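Two details of the 8UC4 path above may be easier to see in scalar form: the per-pixel mask byte is replicated to all four channel bytes by multiplying the widened mask by 0x01010101, and the interleaved accumulator is folded back into four per-channel sums with index % 4, since an interleaved 4-channel byte keeps its channel position modulo 4 across lanes. A small scalar illustration of the folding step (the 16-lane accumulator size is arbitrary for the sketch):

// Folding an interleaved accumulator back into 4 per-channel sums;
// the 16-lane accumulator below is only an example size.
#include <cstdint>

inline void foldPerChannel(const uint64_t lanes[16], uint64_t channel_sum[4])
{
    for (int c = 0; c < 4; ++c) channel_sum[c] = 0;
    for (int i = 0; i < 16; ++i)
        channel_sum[i % 4] += lanes[i];  // lane i holds data of channel (i % 4)
}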
|
||||
inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
|
||||
int nz = 0;
|
||||
int vlmax = __riscv_vsetvlmax_e64m4();
|
||||
vfloat64m4_t vec_sum = __riscv_vfmv_v_f_f64m4(0, vlmax);
|
||||
vfloat64m4_t vec_sqsum = __riscv_vfmv_v_f_f64m4(0, vlmax);
|
||||
src_step /= sizeof(float);
|
||||
if (mask) {
|
||||
for (int i = 0; i < height; ++i) {
|
||||
const float* src_row0 = reinterpret_cast<const float*>(src_data) + i * src_step;
|
||||
const uchar* mask_row = mask + i * mask_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width; j += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(width - j);
|
||||
auto vec_pixel = __riscv_vle32_v_f32m2(src_row0 + j, vl);
|
||||
auto vmask_u8 = __riscv_vle8_v_u8mf2(mask_row + j, vl);
|
||||
auto vmask_u32 = __riscv_vzext_vf4(vmask_u8, vl);
|
||||
auto vmask = __riscv_vmseq_vx_u32m2_b16(vmask_u32, 1, vl);
|
||||
vec_sum = __riscv_vfwadd_wv_f64m4_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vfwmacc_vv_f64m4_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
nz += __riscv_vcpop_m_b16(vmask, vl);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < height; i++) {
|
||||
const float* src_row0 = reinterpret_cast<const float*>(src_data) + i * src_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width; j += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(width - j);
|
||||
auto vec_pixel = __riscv_vle32_v_f32m2(src_row0 + j, vl);
|
||||
vec_sum = __riscv_vfwadd_wv_f64m4_tu(vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vfwmacc_vv_f64m4_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
}
|
||||
}
|
||||
nz = height * width;
|
||||
}
|
||||
if (nz == 0) {
|
||||
if (mean_val) *mean_val = 0.0;
|
||||
if (stddev_val) *stddev_val = 0.0;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
auto zero = __riscv_vfmv_v_f_f64m1(0, vlmax);
|
||||
auto vec_red = __riscv_vfmv_v_f_f64m1(0, vlmax);
|
||||
auto vec_reddev = __riscv_vfmv_v_f_f64m1(0, vlmax);
|
||||
vec_red = __riscv_vfredusum(vec_sum, zero, vlmax);
|
||||
vec_reddev = __riscv_vfredusum(vec_sqsum, zero, vlmax);
|
||||
double sum = __riscv_vfmv_f(vec_red);
|
||||
double mean = sum / nz;
|
||||
if (mean_val) {
|
||||
*mean_val = mean;
|
||||
}
|
||||
if (stddev_val) {
|
||||
double sqsum = __riscv_vfmv_f(vec_reddev);
|
||||
double variance = std::max((sqsum / nz) - (mean * mean), 0.0);
|
||||
double stddev = std::sqrt(variance);
|
||||
*stddev_val = stddev;
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
#endif
|
397
3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp
vendored
Normal file
@ -0,0 +1,397 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
namespace cv { namespace cv_hal_rvv {
|
||||
|
||||
#undef cv_hal_merge8u
|
||||
#define cv_hal_merge8u cv::cv_hal_rvv::merge8u
|
||||
#undef cv_hal_merge16u
|
||||
#define cv_hal_merge16u cv::cv_hal_rvv::merge16u
|
||||
#undef cv_hal_merge32s
|
||||
#define cv_hal_merge32s cv::cv_hal_rvv::merge32s
|
||||
#undef cv_hal_merge64s
|
||||
#define cv_hal_merge64s cv::cv_hal_rvv::merge64s
|
||||
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i = 0;
|
||||
int vl = __riscv_vsetvlmax_e8m1();
|
||||
if( k == 1 )
|
||||
{
|
||||
const uchar* src0 = src[0];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++)
|
||||
dst[i*cn] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
|
||||
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 3, sizeof(uchar)*cn, d, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
dst[i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const uchar *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
i = 0;
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
|
||||
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn + 2, sizeof(uchar)*cn, c, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn + 3, sizeof(uchar)*cn, d, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[k+i*cn] = src0[i];
|
||||
dst[k+i*cn+1] = src1[i];
|
||||
dst[k+i*cn+2] = src2[i];
|
||||
dst[k+i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
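A minimal usage sketch for merge8u: interleaving three single-channel planes into one packed 3-channel buffer via the strided stores above. The plane lengths and values are arbitrary; only merge8u itself comes from this header.

// Interleave B, G, R planes of length 4 into a BGRBGR... buffer.
#include <cstdio>

static void mergeExample()
{
    const unsigned char b[4] = { 1, 2, 3, 4 };
    const unsigned char g[4] = { 10, 20, 30, 40 };
    const unsigned char r[4] = { 100, 110, 120, 130 };
    const unsigned char* planes[3] = { b, g, r };
    unsigned char packed[12];

    cv::cv_hal_rvv::merge8u(planes, packed, /*len=*/4, /*cn=*/3);

    for (int i = 0; i < 12; ++i)
        std::printf("%d ", packed[i]);   // prints 1 10 100 2 20 110 3 30 120 4 40 130
    std::printf("\n");
}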
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i = 0;
|
||||
int vl = __riscv_vsetvlmax_e16m1();
|
||||
if( k == 1 )
|
||||
{
|
||||
const ushort* src0 = src[0];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++)
|
||||
dst[i*cn] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
|
||||
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 3, sizeof(ushort)*cn, d, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
dst[i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const uint16_t *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
i = 0;
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
|
||||
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn + 2, sizeof(ushort)*cn, c, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn + 3, sizeof(ushort)*cn, d, vl);
|
||||
}
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[k+i*cn] = src0[i];
|
||||
dst[k+i*cn+1] = src1[i];
|
||||
dst[k+i*cn+2] = src2[i];
|
||||
dst[k+i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline int merge32s(const int** src, int* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i, j;
|
||||
if( k == 1 )
|
||||
{
|
||||
const int* src0 = src[0];
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( i = j = 0; i < len; i++, j += cn )
|
||||
dst[j] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
const int *src0 = src[0], *src1 = src[1];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
const int *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const int *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const int *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
for( i = 0, j = k; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline int merge64s(const int64** src, int64* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i, j;
|
||||
if( k == 1 )
|
||||
{
|
||||
const int64* src0 = src[0];
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( i = j = 0; i < len; i++, j += cn )
|
||||
dst[j] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
const int64 *src0 = src[0], *src1 = src[1];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
const int64 *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const int64 *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const int64 *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
for( i = 0, j = k; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
#endif
|
28
3rdparty/kleidicv/CMakeLists.txt
vendored
@ -1,23 +1,11 @@
|
||||
project(kleidicv_hal)
|
||||
|
||||
set(KLEIDICV_SOURCE_PATH "" CACHE PATH "Directory containing KleidiCV sources")
|
||||
ocv_update(KLEIDICV_SRC_COMMIT "0.1.0")
|
||||
ocv_update(KLEIDICV_SRC_HASH "9388f28cf2fbe3338197b2b57d491468")
|
||||
|
||||
if(KLEIDICV_SOURCE_PATH)
|
||||
set(THE_ROOT "${KLEIDICV_SOURCE_PATH}")
|
||||
else()
|
||||
ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
|
||||
HASH ${KLEIDICV_SRC_HASH}
|
||||
URL
|
||||
"${OPENCV_KLEIDICV_URL}"
|
||||
"$ENV{OPENCV_KLEIDICV_URL}"
|
||||
"https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
|
||||
DESTINATION_DIR "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/"
|
||||
ID KLEIDICV
|
||||
STATUS res
|
||||
UNPACK RELATIVE_URL)
|
||||
set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}")
|
||||
if(HAVE_KLEIDICV)
|
||||
option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some Clang versions in NDK
|
||||
include("${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
|
||||
# HACK to suppress adapters/opencv/kleidicv_hal.cpp:343:12: warning: unused function 'from_opencv' [-Wunused-function]
|
||||
target_compile_options( kleidicv_hal PRIVATE
|
||||
$<TARGET_PROPERTY:kleidicv,COMPILE_OPTIONS>
|
||||
"-Wno-old-style-cast" "-Wno-unused-function"
|
||||
)
|
||||
endif()
|
||||
|
||||
include("${THE_ROOT}/adapters/opencv/CMakeLists.txt")
|
||||
|
21
3rdparty/kleidicv/kleidicv.cmake
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
function(download_kleidicv root_var)
|
||||
set(${root_var} "" PARENT_SCOPE)
|
||||
|
||||
ocv_update(KLEIDICV_SRC_COMMIT "0.3.0")
|
||||
ocv_update(KLEIDICV_SRC_HASH "51a77b0185c2bac2a968a2163869b1ed")
|
||||
|
||||
set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv")
|
||||
ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
|
||||
HASH ${KLEIDICV_SRC_HASH}
|
||||
URL
|
||||
"${OPENCV_KLEIDICV_URL}"
|
||||
"$ENV{OPENCV_KLEIDICV_URL}"
|
||||
"https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
|
||||
DESTINATION_DIR ${THE_ROOT}
|
||||
ID KLEIDICV
|
||||
STATUS res
|
||||
UNPACK RELATIVE_URL)
|
||||
if(res)
|
||||
set(${root_var} "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}" PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
48
3rdparty/ndsrvp/include/imgproc.hpp
vendored
@ -5,6 +5,8 @@
|
||||
#ifndef OPENCV_NDSRVP_IMGPROC_HPP
|
||||
#define OPENCV_NDSRVP_IMGPROC_HPP
|
||||
|
||||
struct cvhalFilter2D;
|
||||
|
||||
namespace cv {
|
||||
|
||||
namespace ndsrvp {
|
||||
@ -71,6 +73,52 @@ int threshold(const uchar* src_data, size_t src_step,
|
||||
#undef cv_hal_threshold
|
||||
#define cv_hal_threshold (cv::ndsrvp::threshold)
|
||||
|
||||
// ################ filter ################
|
||||
|
||||
int filterInit(cvhalFilter2D **context,
|
||||
uchar *kernel_data, size_t kernel_step,
|
||||
int kernel_type, int kernel_width,
|
||||
int kernel_height, int max_width, int max_height,
|
||||
int src_type, int dst_type, int borderType,
|
||||
double delta, int anchor_x, int anchor_y,
|
||||
bool allowSubmatrix, bool allowInplace);
|
||||
|
||||
#undef cv_hal_filterInit
|
||||
#define cv_hal_filterInit (cv::ndsrvp::filterInit)
|
||||
|
||||
int filter(cvhalFilter2D *context,
|
||||
const uchar *src_data, size_t src_step,
|
||||
uchar *dst_data, size_t dst_step,
|
||||
int width, int height,
|
||||
int full_width, int full_height,
|
||||
int offset_x, int offset_y);
|
||||
|
||||
#undef cv_hal_filter
|
||||
#define cv_hal_filter (cv::ndsrvp::filter)
|
||||
|
||||
int filterFree(cvhalFilter2D *context);
|
||||
|
||||
#undef cv_hal_filterFree
|
||||
#define cv_hal_filterFree (cv::ndsrvp::filterFree)
|
||||
|
||||
// ################ medianBlur ################
|
||||
|
||||
int medianBlur(const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step,
|
||||
int width, int height, int depth, int cn, int ksize);
|
||||
|
||||
#undef cv_hal_medianBlur
|
||||
#define cv_hal_medianBlur (cv::ndsrvp::medianBlur)
|
||||
|
||||
// ################ bilateralFilter ################
|
||||
|
||||
int bilateralFilter(const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step, int width, int height, int depth,
|
||||
int cn, int d, double sigma_color, double sigma_space, int border_type);
|
||||
|
||||
#undef cv_hal_bilateralFilter
|
||||
#define cv_hal_bilateralFilter (cv::ndsrvp::bilateralFilter)
|
||||
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
||||
|
270
3rdparty/ndsrvp/src/bilateralFilter.cpp
vendored
Normal file
@ -0,0 +1,270 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "ndsrvp_hal.hpp"
|
||||
#include "opencv2/imgproc/hal/interface.h"
|
||||
#include "cvutils.hpp"
|
||||
|
||||
namespace cv {
|
||||
|
||||
namespace ndsrvp {
|
||||
|
||||
static void bilateralFilterProcess(uchar* dst_data, size_t dst_step, uchar* pad_data, size_t pad_step,
|
||||
int width, int height, int cn, int radius, int maxk,
|
||||
int* space_ofs, float *space_weight, float *color_weight)
|
||||
{
|
||||
int i, j, k;
|
||||
|
||||
for( i = 0; i < height; i++ )
|
||||
{
|
||||
const uchar* sptr = pad_data + (i + radius) * pad_step + radius * cn;
|
||||
uchar* dptr = dst_data + i * dst_step;
|
||||
|
||||
if( cn == 1 )
|
||||
{
|
||||
std::vector<float> buf(width + width, 0.0);
|
||||
float *sum = &buf[0];
|
||||
float *wsum = sum + width;
|
||||
k = 0;
|
||||
for(; k <= maxk-4; k+=4)
|
||||
{
|
||||
const uchar* ksptr0 = sptr + space_ofs[k];
|
||||
const uchar* ksptr1 = sptr + space_ofs[k+1];
|
||||
const uchar* ksptr2 = sptr + space_ofs[k+2];
|
||||
const uchar* ksptr3 = sptr + space_ofs[k+3];
|
||||
j = 0;
|
||||
for (; j < width; j++)
|
||||
{
|
||||
int rval = sptr[j];
|
||||
|
||||
int val = ksptr0[j];
|
||||
float w = space_weight[k] * color_weight[std::abs(val - rval)];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
|
||||
val = ksptr1[j];
|
||||
w = space_weight[k+1] * color_weight[std::abs(val - rval)];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
|
||||
val = ksptr2[j];
|
||||
w = space_weight[k+2] * color_weight[std::abs(val - rval)];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
|
||||
val = ksptr3[j];
|
||||
w = space_weight[k+3] * color_weight[std::abs(val - rval)];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
}
|
||||
}
|
||||
for(; k < maxk; k++)
|
||||
{
|
||||
const uchar* ksptr = sptr + space_ofs[k];
|
||||
j = 0;
|
||||
for (; j < width; j++)
|
||||
{
|
||||
int val = ksptr[j];
|
||||
float w = space_weight[k] * color_weight[std::abs(val - sptr[j])];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
}
|
||||
}
|
||||
j = 0;
|
||||
for (; j < width; j++)
|
||||
{
|
||||
// overflow is not possible here => there is no need to use cv::saturate_cast
|
||||
ndsrvp_assert(fabs(wsum[j]) > 0);
|
||||
dptr[j] = (uchar)(sum[j] / wsum[j] + 0.5);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ndsrvp_assert( cn == 3 );
|
||||
std::vector<float> buf(width * 3 + width);
|
||||
float *sum_b = &buf[0];
|
||||
float *sum_g = sum_b + width;
|
||||
float *sum_r = sum_g + width;
|
||||
float *wsum = sum_r + width;
|
||||
k = 0;
|
||||
for(; k <= maxk-4; k+=4)
|
||||
{
|
||||
const uchar* ksptr0 = sptr + space_ofs[k];
|
||||
const uchar* ksptr1 = sptr + space_ofs[k+1];
|
||||
const uchar* ksptr2 = sptr + space_ofs[k+2];
|
||||
const uchar* ksptr3 = sptr + space_ofs[k+3];
|
||||
const uchar* rsptr = sptr;
|
||||
j = 0;
|
||||
for(; j < width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
|
||||
{
|
||||
int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
|
||||
|
||||
int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
|
||||
float w = space_weight[k] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
|
||||
b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
|
||||
w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
|
||||
b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
|
||||
w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
|
||||
b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
|
||||
w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
}
|
||||
}
|
||||
for(; k < maxk; k++)
|
||||
{
|
||||
const uchar* ksptr = sptr + space_ofs[k];
|
||||
const uchar* rsptr = sptr;
|
||||
j = 0;
|
||||
for(; j < width; j++, ksptr += 3, rsptr += 3)
|
||||
{
|
||||
int b = ksptr[0], g = ksptr[1], r = ksptr[2];
|
||||
float w = space_weight[k] * color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
}
|
||||
}
|
||||
j = 0;
|
||||
for(; j < width; j++)
|
||||
{
|
||||
ndsrvp_assert(fabs(wsum[j]) > 0);
|
||||
wsum[j] = 1.f / wsum[j];
|
||||
*(dptr++) = (uchar)(sum_b[j] * wsum[j] + 0.5);
|
||||
*(dptr++) = (uchar)(sum_g[j] * wsum[j] + 0.5);
|
||||
*(dptr++) = (uchar)(sum_r[j] * wsum[j] + 0.5);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int bilateralFilter(const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step, int width, int height, int depth,
|
||||
int cn, int d, double sigma_color, double sigma_space, int border_type)
|
||||
{
|
||||
if( depth != CV_8U || !(cn == 1 || cn == 3) || src_data == dst_data)
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
int i, j, maxk, radius;
|
||||
|
||||
if( sigma_color <= 0 )
|
||||
sigma_color = 1;
|
||||
if( sigma_space <= 0 )
|
||||
sigma_space = 1;
|
||||
|
||||
double gauss_color_coeff = -0.5/(sigma_color * sigma_color);
|
||||
double gauss_space_coeff = -0.5/(sigma_space * sigma_space);
|
||||
|
||||
if( d <= 0 )
|
||||
radius = (int)(sigma_space * 1.5 + 0.5);
|
||||
else
|
||||
radius = d / 2;
|
||||
|
||||
radius = MAX(radius, 1);
|
||||
d = radius * 2 + 1;
|
||||
|
||||
// not enough submatrix info
|
||||
// fetch original image data
|
||||
const uchar *ogn_data = src_data;
|
||||
int ogn_step = src_step;
|
||||
|
||||
// ROI fully used in the computation
|
||||
int cal_width = width + d - 1;
|
||||
int cal_height = height + d - 1;
|
||||
int cal_x = 0 - radius; // negative if left border exceeded
|
||||
int cal_y = 0 - radius; // negative if top border exceeded
|
||||
|
||||
// calculate source border
|
||||
std::vector<uchar> padding;
|
||||
padding.resize(cal_width * cal_height * cn);
|
||||
uchar* pad_data = &padding[0];
|
||||
int pad_step = cal_width * cn;
|
||||
|
||||
uchar* pad_ptr;
|
||||
const uchar* ogn_ptr;
|
||||
std::vector<uchar> vec_zeros(cn, 0);
|
||||
for(i = 0; i < cal_height; i++)
|
||||
{
|
||||
int y = borderInterpolate(i + cal_y, height, border_type);
|
||||
if(y < 0) {
|
||||
memset(pad_data + i * pad_step, 0, cn * cal_width);
|
||||
continue;
|
||||
}
|
||||
|
||||
// left border
|
||||
j = 0;
|
||||
for(; j + cal_x < 0; j++)
|
||||
{
|
||||
int x = borderInterpolate(j + cal_x, width, border_type);
|
||||
if(x < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + x * cn;
|
||||
pad_ptr = pad_data + i * pad_step + j * cn;
|
||||
memcpy(pad_ptr, ogn_ptr, cn);
|
||||
}
|
||||
|
||||
// center
|
||||
int rborder = MIN(cal_width, width - cal_x);
|
||||
ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cn;
|
||||
pad_ptr = pad_data + i * pad_step + j * cn;
|
||||
memcpy(pad_ptr, ogn_ptr, cn * (rborder - j));
|
||||
|
||||
// right border
|
||||
j = rborder;
|
||||
for(; j < cal_width; j++)
|
||||
{
|
||||
int x = borderInterpolate(j + cal_x, width, border_type);
|
||||
if(x < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + x * cn;
|
||||
pad_ptr = pad_data + i * pad_step + j * cn;
|
||||
memcpy(pad_ptr, ogn_ptr, cn);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> _color_weight(cn * 256);
|
||||
std::vector<float> _space_weight(d * d);
|
||||
std::vector<int> _space_ofs(d * d);
|
||||
float* color_weight = &_color_weight[0];
|
||||
float* space_weight = &_space_weight[0];
|
||||
int* space_ofs = &_space_ofs[0];
|
||||
|
||||
// initialize color-related bilateral filter coefficients
|
||||
|
||||
for( i = 0; i < 256 * cn; i++ )
|
||||
color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
|
||||
|
||||
// initialize space-related bilateral filter coefficients
|
||||
for( i = -radius, maxk = 0; i <= radius; i++ )
|
||||
{
|
||||
j = -radius;
|
||||
|
||||
for( ; j <= radius; j++ )
|
||||
{
|
||||
double r = std::sqrt((double)i * i + (double)j * j);
|
||||
if( r > radius )
|
||||
continue;
|
||||
space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
|
||||
space_ofs[maxk++] = (int)(i * pad_step + j * cn);
|
||||
}
|
||||
}
|
||||
|
||||
bilateralFilterProcess(dst_data, dst_step, pad_data, pad_step, width, height, cn, radius, maxk, space_ofs, space_weight, color_weight);
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
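The two lookup tables built above follow the standard bilateral weights: color_weight[d] = exp(-d^2 / (2*sigma_color^2)), indexed by absolute intensity difference, and space_weight[k] = exp(-r^2 / (2*sigma_space^2)) for each in-radius spatial offset. A scalar sketch of the combined weight of one neighbour, with assumed names and illustrative arguments:

// Weight of one neighbour (nb) relative to the centre pixel (cp) in the
// single-channel case; sigma values and pixel values are illustrative.
#include <cmath>
#include <cstdlib>

inline float bilateralWeight(int cp, int nb, double dx, double dy,
                             double sigma_color, double sigma_space)
{
    const double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
    const double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
    const double r2 = dx * dx + dy * dy;        // squared spatial distance
    const int    cd = std::abs(nb - cp);        // intensity difference
    return float(std::exp(cd * cd * gauss_color_coeff) *
                 std::exp(r2 * gauss_space_coeff));
}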
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
34
3rdparty/ndsrvp/src/cvutils.cpp
vendored
@ -73,6 +73,40 @@ int borderInterpolate(int p, int len, int borderType)
|
||||
return p;
|
||||
}
|
||||
|
||||
int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType)
|
||||
{
|
||||
int16x4_t vzero = (int16x4_t){0, 0, 0, 0};
|
||||
int16x4_t vone = (int16x4_t){1, 1, 1, 1};
|
||||
int16x4_t vlen = (int16x4_t){len, len, len, len};
|
||||
if(borderType == CV_HAL_BORDER_REPLICATE)
|
||||
vp = (int16x4_t)__nds__bpick(0, __nds__bpick((long)(vlen - 1), (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
|
||||
else if(borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101)
|
||||
{
|
||||
int16x4_t vdelta = (borderType == CV_HAL_BORDER_REFLECT_101) ? vone : vzero;
|
||||
if(len == 1)
|
||||
return vzero;
|
||||
do
|
||||
{
|
||||
int16x4_t vneg = -vp - 1 + vdelta;
|
||||
int16x4_t vpos = vlen - 1 - (vp - vlen) - vdelta;
|
||||
vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
|
||||
}
|
||||
while( (long)(vp >= vlen) || (long)(vp < 0) );
|
||||
}
|
||||
else if(borderType == CV_HAL_BORDER_WRAP)
|
||||
{
|
||||
ndsrvp_assert(len > 0);
|
||||
int16x4_t vneg = vp - ((vp - vlen + 1) / vlen) * vlen;
|
||||
int16x4_t vpos = vp % vlen;
|
||||
vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
|
||||
}
|
||||
else if(borderType == CV_HAL_BORDER_CONSTANT)
|
||||
vp = (int16x4_t)__nds__bpick((long)-vone, (long)vp, (long)(vp < 0 || vp >= vlen));
|
||||
else
|
||||
ndsrvp_error(Error::StsBadArg, "borderInterpolate_vector(): Unknown/unsupported border type");
|
||||
return vp;
|
||||
}
|
||||
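A few concrete values may help when reading both the scalar and the vector border helpers; for len = 5, the common modes map out-of-range coordinates as sketched below. The checks are only illustrative and assume the HAL interface header providing the CV_HAL_BORDER_* constants is visible.

// Expected borderInterpolate results for len = 5; illustrative only.
#include <cassert>

static void borderInterpolateExamples()
{
    // REPLICATE: aaaaa|abcde|eeeee
    assert(cv::ndsrvp::borderInterpolate(-2, 5, CV_HAL_BORDER_REPLICATE) == 0);
    assert(cv::ndsrvp::borderInterpolate( 6, 5, CV_HAL_BORDER_REPLICATE) == 4);
    // REFLECT_101: edcb|abcde|dcba
    assert(cv::ndsrvp::borderInterpolate(-2, 5, CV_HAL_BORDER_REFLECT_101) == 2);
    assert(cv::ndsrvp::borderInterpolate( 6, 5, CV_HAL_BORDER_REFLECT_101) == 2);
    // CONSTANT: out-of-range positions report -1 so callers substitute zeros
    assert(cv::ndsrvp::borderInterpolate(-1, 5, CV_HAL_BORDER_CONSTANT) == -1);
}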
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
||||
|
160
3rdparty/ndsrvp/src/cvutils.hpp
vendored
@ -14,6 +14,7 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <climits>
|
||||
#include <algorithm>
|
||||
|
||||
@ -26,16 +27,26 @@ namespace ndsrvp {
|
||||
void* fastMalloc(size_t size);
|
||||
void fastFree(void* ptr);
|
||||
int borderInterpolate(int p, int len, int borderType);
|
||||
int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType);
|
||||
|
||||
#ifndef MAX
|
||||
# define MAX(a,b) ((a) < (b) ? (b) : (a))
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
# define MIN(a,b) ((a) > (b) ? (b) : (a))
|
||||
#endif
|
||||
|
||||
#define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT)
|
||||
#define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
|
||||
|
||||
#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)
|
||||
#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
|
||||
|
||||
#define CV_MALLOC_ALIGN 64
|
||||
|
||||
inline size_t getElemSize(int type) { return (size_t)CV_ELEM_SIZE(type); }
|
||||
|
||||
// error codes
|
||||
|
||||
enum Error{
|
||||
@ -69,6 +80,135 @@ inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b)
|
||||
return (int32x2_t)__nds__bpick((long)a, __nds__bpick((long)(b - 1), (long)x, (long)(x < b)), (long)(x >= a));
|
||||
}
|
||||
|
||||
// expand
|
||||
|
||||
/*
|
||||
[0] [1] [2] [3] [4] [5] [6] [7]
|
||||
810 [ 0 ] [ 1 ] [ 4 ] [ 5 ]
|
||||
832 [ 2 ] [ 3 ] [ 6 ] [ 7 ]
|
||||
bb [ 0 ] [ 1 ] [ 2 ] [ 3 ]
|
||||
tt [ 4 ] [ 5 ] [ 6 ] [ 7 ]
|
||||
*/
|
||||
|
||||
inline void ndsrvp_u8_u16_expand8(const unsigned long vs, ushort* dst)
|
||||
{
|
||||
unsigned long vs810 = __nds__zunpkd810(vs);
|
||||
unsigned long vs832 = __nds__zunpkd832(vs);
|
||||
*(unsigned long*)dst = __nds__pkbb32(vs832, vs810);
|
||||
*(unsigned long*)(dst + 4) = __nds__pktt32(vs832, vs810);
|
||||
}
|
||||
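The lane diagram above is easiest to verify against a scalar reference: expand8 widens eight packed u8 values to eight u16 values while preserving their original order, whereas the eswap8 variant below it emits the pair-swapped order shown in its own diagram. A scalar sketch of the expand case, assuming the same little-endian lane layout the packed intrinsics operate on:

// Scalar reference for ndsrvp_u8_u16_expand8: widen 8 bytes to 8 ushorts,
// preserving element order; illustrative only.
inline void u8_u16_expand8_ref(const unsigned char src[8], unsigned short dst[8])
{
    for (int i = 0; i < 8; ++i)
        dst[i] = src[i];
}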
|
||||
/*
|
||||
[0] [1] [2] [3] [4] [5] [6] [7]
|
||||
820 [ 0 ] [ 2 ] [ 4 ] [ 6 ]
|
||||
831 [ 1 ] [ 3 ] [ 5 ] [ 7 ]
|
||||
bb [ 0 ] [ 2 ] [ 1 ] [ 3 ]
|
||||
tt [ 4 ] [ 6 ] [ 5 ] [ 7 ]
|
||||
*/
|
||||
|
||||
inline void ndsrvp_u8_u16_eswap8(const unsigned long vs, ushort* dst)
|
||||
{
|
||||
unsigned long vs820 = __nds__zunpkd820(vs);
|
||||
unsigned long vs831 = __nds__zunpkd831(vs);
|
||||
*(unsigned long*)dst = __nds__pkbb32(vs831, vs820);
|
||||
*(unsigned long*)(dst + 4) = __nds__pktt32(vs831, vs820);
|
||||
}
|
||||
|
||||
/*
|
||||
[0] [1] [2] [3] [4] [5] [6] [7]
|
||||
820 [ 0 ] [ 2 ] [ 4 ] [ 6 ]
|
||||
831 [ 1 ] [ 3 ] [ 5 ] [ 7 ]
|
||||
bb [ 0 ] [ 2 ] [ 1 ] [ 3 ]
|
||||
tt [ 4 ] [ 6 ] [ 5 ] [ 7 ]
|
||||
bbbb[ 0 ] [ 1 ]
|
||||
bbtt[ 2 ] [ 3 ]
|
||||
ttbb[ 4 ] [ 5 ]
|
||||
tttt[ 6 ] [ 7 ]
|
||||
*/
|
||||
|
||||
|
||||
inline void ndsrvp_u8_u32_expand8(const unsigned long vs, uint* dst)
|
||||
{
|
||||
unsigned long vs820 = __nds__zunpkd820(vs);
|
||||
unsigned long vs831 = __nds__zunpkd831(vs);
|
||||
unsigned long vsbb = __nds__pkbb32(vs831, vs820);
|
||||
unsigned long vstt = __nds__pktt32(vs831, vs820);
|
||||
*(unsigned long*)dst = __nds__pkbb16(0, vsbb);
|
||||
*(unsigned long*)(dst + 2) = __nds__pktt16(0, vsbb);
|
||||
*(unsigned long*)(dst + 4) = __nds__pkbb16(0, vstt);
|
||||
*(unsigned long*)(dst + 6) = __nds__pktt16(0, vstt);
|
||||
}
|
||||
|
||||
// float replacement
|
||||
|
||||
inline void ndsrvp_f32_add8(const float* a, const float* b, float* c)
|
||||
{
|
||||
c[0] = a[0] + b[0];
|
||||
c[1] = a[1] + b[1];
|
||||
c[2] = a[2] + b[2];
|
||||
c[3] = a[3] + b[3];
|
||||
c[4] = a[4] + b[4];
|
||||
c[5] = a[5] + b[5];
|
||||
c[6] = a[6] + b[6];
|
||||
c[7] = a[7] + b[7];
|
||||
}
|
||||
|
||||
/*
|
||||
[1] [8] [23]
|
||||
[24] [8]
|
||||
*/
|
||||
|
||||
inline void ndsrvp_f32_u8_mul8(const float* a, const unsigned long b, float* c) // experimental, not bit exact
|
||||
{
|
||||
const int mask_frac = 0x007FFFFF;
|
||||
const int mask_sign = 0x7FFFFFFF;
|
||||
const int mask_lead = 0x40000000;
|
||||
const int ofs_exp = 23;
|
||||
|
||||
uint32x2_t va01 = *(uint32x2_t*)a;
|
||||
uint32x2_t va23 = *(uint32x2_t*)(a + 2);
|
||||
uint32x2_t va45 = *(uint32x2_t*)(a + 4);
|
||||
uint32x2_t va67 = *(uint32x2_t*)(a + 6);
|
||||
|
||||
uint32x2_t vaexp01 = va01 >> ofs_exp;
|
||||
uint32x2_t vaexp23 = va23 >> ofs_exp;
|
||||
uint32x2_t vaexp45 = va45 >> ofs_exp;
|
||||
uint32x2_t vaexp67 = va67 >> ofs_exp;
|
||||
|
||||
uint32x2_t vafrac01 = ((va01 << 7) & mask_sign) | mask_lead;
|
||||
uint32x2_t vafrac23 = ((va23 << 7) & mask_sign) | mask_lead;
|
||||
uint32x2_t vafrac45 = ((va45 << 7) & mask_sign) | mask_lead;
|
||||
uint32x2_t vafrac67 = ((va67 << 7) & mask_sign) | mask_lead;
|
||||
|
||||
int16x4_t vb[2]; // fake signed for signed multiply
|
||||
ndsrvp_u8_u16_eswap8(b, (ushort*)vb);
|
||||
|
||||
vafrac01 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac01, (unsigned long)vb[0]);
|
||||
vafrac23 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac23, (unsigned long)vb[0]);
|
||||
vafrac45 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac45, (unsigned long)vb[1]);
|
||||
vafrac67 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac67, (unsigned long)vb[1]);
|
||||
|
||||
uint32x2_t vaclz01 = __nds__v_clz32(vafrac01) - 8;
|
||||
uint32x2_t vaclz23 = __nds__v_clz32(vafrac23) - 8;
|
||||
uint32x2_t vaclz45 = __nds__v_clz32(vafrac45) - 8;
|
||||
uint32x2_t vaclz67 = __nds__v_clz32(vafrac67) - 8;
|
||||
|
||||
vaexp01 += 8 - vaclz01;
|
||||
vaexp23 += 8 - vaclz23;
|
||||
vaexp45 += 8 - vaclz45;
|
||||
vaexp67 += 8 - vaclz67;
|
||||
|
||||
vafrac01 <<= vaclz01;
|
||||
vafrac23 <<= vaclz23;
|
||||
vafrac45 <<= vaclz45;
|
||||
vafrac67 <<= vaclz67;
|
||||
|
||||
*(uint32x2_t*)c = (vaexp01 << ofs_exp) | (vafrac01 & mask_frac);
|
||||
*(uint32x2_t*)(c + 2) = (vaexp23 << ofs_exp) | (vafrac23 & mask_frac);
|
||||
*(uint32x2_t*)(c + 4) = (vaexp45 << ofs_exp) | (vafrac45 & mask_frac);
|
||||
*(uint32x2_t*)(c + 6) = (vaexp67 << ofs_exp) | (vafrac67 & mask_frac);
|
||||
}
|
||||
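The bit-twiddling routine above approximates, lane by lane, the product of a float and an 8-bit integer by operating on the exponent and mantissa fields directly, and is explicitly marked as not bit exact. The straightforward scalar computation it stands in for looks like this (the name and the unpacked byte array are assumptions for the sketch; the real routine takes the eight bytes packed into one register):

// What ndsrvp_f32_u8_mul8 approximates: c[i] = a[i] * b[i] for 8 lanes.
inline void f32_u8_mul8_ref(const float a[8], const unsigned char b[8], float c[8])
{
    for (int i = 0; i < 8; ++i)
        c[i] = a[i] * float(b[i]);
}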
|
||||
// saturate
|
||||
|
||||
template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
|
||||
@ -94,6 +234,26 @@ template<> inline short saturate_cast<short>(double v) { return saturate_cas
|
||||
template<> inline int saturate_cast<int>(float v) { return (int)lrintf(v); }
|
||||
template<> inline int saturate_cast<int>(double v) { return (int)lrint(v); }
|
||||
|
||||
inline double cast_ptr_to_double(const uchar* v, int depth) {
|
||||
switch (depth) {
|
||||
case CV_8U: return (double)*(uchar*)v;
|
||||
case CV_8S: return (double)*(char*)v;
|
||||
case CV_16U: return (double)*(ushort*)v;
|
||||
case CV_16S: return (double)*(short*)v;
|
||||
case CV_32S: return (double)*(int*)v;
|
||||
case CV_32F: return (double)*(float*)v;
|
||||
case CV_64F: return (double)*(double*)v;
|
||||
case CV_16F: return (double)*(float*)v;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename _Tp>
|
||||
inline _Tp data_at(const uchar* data, int step, int y, int x, int cn)
|
||||
{
|
||||
return ((_Tp*)(data + y * step))[x * cn];
|
||||
}
|
||||
|
||||
// align
|
||||
|
||||
inline long align(size_t v, int n)
|
||||
|
321
3rdparty/ndsrvp/src/filter.cpp
vendored
Normal file
@ -0,0 +1,321 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "ndsrvp_hal.hpp"
|
||||
#include "opencv2/imgproc/hal/interface.h"
|
||||
#include "cvutils.hpp"
|
||||
|
||||
namespace cv {
|
||||
|
||||
namespace ndsrvp {
|
||||
|
||||
class FilterData
|
||||
{
|
||||
public:
|
||||
FilterData(uchar *_kernel_data, size_t _kernel_step, int _kernel_type, int _src_type, int _dst_type, int _borderType,
|
||||
int _kernel_width, int _kernel_height, int _max_width, int _max_height, double _delta, int _anchor_x, int _anchor_y)
|
||||
: kernel_data(_kernel_data), kernel_step(_kernel_step), kernel_type(_kernel_type), src_type(_src_type), dst_type(_dst_type), borderType(_borderType),
|
||||
kernel_width(_kernel_width), kernel_height(_kernel_height), max_width(_max_width), max_height(_max_height), delta(_delta), anchor_x(_anchor_x), anchor_y(_anchor_y)
|
||||
{
|
||||
}
|
||||
|
||||
uchar *kernel_data;
|
||||
size_t kernel_step; // bytes between rows(height)
|
||||
int kernel_type, src_type, dst_type, borderType;
|
||||
int kernel_width, kernel_height;
|
||||
int max_width, max_height;
|
||||
double delta;
|
||||
int anchor_x, anchor_y;
|
||||
std::vector<uchar> coords;
|
||||
std::vector<float> coeffs;
|
||||
int nz;
|
||||
std::vector<uchar> padding;
|
||||
};
|
||||
|
||||
static int countNonZero(const FilterData* ctx)
|
||||
{
|
||||
int i, j, nz = 0;
|
||||
const uchar* ker_row = ctx->kernel_data;
|
||||
for( i = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
|
||||
{
|
||||
for( j = 0; j < ctx->kernel_width; j++ )
|
||||
{
|
||||
if( ((float*)ker_row)[j] != 0.0 )
|
||||
nz++;
|
||||
}
|
||||
}
|
||||
return nz;
|
||||
}
|
||||
|
||||
static void preprocess2DKernel(FilterData* ctx)
|
||||
{
|
||||
int i, j, k, nz = countNonZero(ctx), ktype = ctx->kernel_type;
|
||||
if(nz == 0)
|
||||
nz = 1; // (0, 0) == 0 by default
|
||||
ndsrvp_assert( ktype == CV_32F );
|
||||
|
||||
ctx->coords.resize(nz * 2);
|
||||
ctx->coeffs.resize(nz);
|
||||
|
||||
const uchar* ker_row = ctx->kernel_data;
|
||||
for( i = k = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
|
||||
{
|
||||
for( j = 0; j < ctx->kernel_width; j++ )
|
||||
{
|
||||
float val = ((float*)ker_row)[j];
|
||||
if( val == 0.0 )
|
||||
continue;
|
||||
ctx->coords[k * 2] = j;
|
||||
ctx->coords[k * 2 + 1] = i;
|
||||
ctx->coeffs[k++] = val;
|
||||
}
|
||||
}
|
||||
|
||||
ctx->nz = k;
|
||||
}
|
||||
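preprocess2DKernel turns the dense 32F kernel into a sparse representation: coords holds the (x, y) positions of the non-zero taps and coeffs holds their values, so the inner filter loop only visits nz taps. A standalone restatement of that step on a plain row-major float array, without the FilterData context, as a sketch:

// Standalone restatement of the sparsification step; illustrative only.
#include <vector>

struct SparseKernel
{
    std::vector<unsigned char> coords; // x,y pairs, same order as ctx->coords
    std::vector<float> coeffs;
};

inline SparseKernel sparsify(const float* kernel, int kw, int kh)
{
    SparseKernel out;
    for (int y = 0; y < kh; ++y)
        for (int x = 0; x < kw; ++x)
        {
            float v = kernel[y * kw + x];
            if (v == 0.0f)
                continue;
            out.coords.push_back((unsigned char)x);
            out.coords.push_back((unsigned char)y);
            out.coeffs.push_back(v);
        }
    return out;
}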
|
||||
int filterInit(cvhalFilter2D **context,
|
||||
uchar *kernel_data, size_t kernel_step,
|
||||
int kernel_type, int kernel_width,
|
||||
int kernel_height, int max_width, int max_height,
|
||||
int src_type, int dst_type, int borderType,
|
||||
double delta, int anchor_x, int anchor_y,
|
||||
bool allowSubmatrix, bool allowInplace)
|
||||
{
|
||||
int sdepth = CV_MAT_DEPTH(src_type), ddepth = CV_MAT_DEPTH(dst_type);
|
||||
int cn = CV_MAT_CN(src_type), kdepth = kernel_type;
|
||||
|
||||
(void)allowSubmatrix;
|
||||
(void)allowInplace;
|
||||
|
||||
if(delta - (int)delta != 0.0)
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
if(kdepth != CV_32F || (sdepth != CV_8U && sdepth != CV_16U) || ddepth != sdepth)
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
FilterData *ctx = new FilterData(kernel_data, kernel_step, kernel_type, src_type, dst_type, borderType,
|
||||
kernel_width, kernel_height, max_width, max_height, delta, anchor_x, anchor_y);
|
||||
|
||||
*context = (cvhalFilter2D*)ctx;
|
||||
|
||||
ndsrvp_assert(cn == CV_MAT_CN(dst_type) && ddepth >= sdepth);
|
||||
|
||||
preprocess2DKernel(ctx);
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
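A minimal sketch of the HAL filter lifecycle these entry points implement: filterInit builds the context from the kernel, filter is then called per image (or per tile, with offsets into the full image), and filterFree releases the context. The image size, kernel and anchor below are arbitrary, and the usual OpenCV type and border constants are assumed to be visible from the interface header.

// Hypothetical driver for the cv::ndsrvp filter HAL; values are illustrative.
#include <cstddef>

static int runFilterOnce(const unsigned char* src, size_t src_step,
                         unsigned char* dst, size_t dst_step,
                         int width, int height)
{
    float kernel[9] = { 0, 0.2f, 0, 0.2f, 0.2f, 0.2f, 0, 0.2f, 0 };

    cvhalFilter2D* ctx = nullptr;
    int res = cv::ndsrvp::filterInit(&ctx,
                                     reinterpret_cast<unsigned char*>(kernel),
                                     3 * sizeof(float), CV_32F, 3, 3,
                                     width, height, CV_8UC1, CV_8UC1,
                                     CV_HAL_BORDER_REPLICATE,
                                     /*delta=*/0.0, /*anchor_x=*/1, /*anchor_y=*/1,
                                     /*allowSubmatrix=*/false, /*allowInplace=*/false);
    if (res != CV_HAL_ERROR_OK)
        return res;

    // whole image processed in one call, so the ROI offsets are zero
    res = cv::ndsrvp::filter(ctx, src, src_step, dst, dst_step,
                             width, height, width, height, 0, 0);
    cv::ndsrvp::filterFree(ctx);
    return res;
}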
|
||||
int filter(cvhalFilter2D *context,
|
||||
const uchar *src_data, size_t src_step,
|
||||
uchar *dst_data, size_t dst_step,
|
||||
int width, int height,
|
||||
int full_width, int full_height,
|
||||
int offset_x, int offset_y)
|
||||
{
|
||||
FilterData *ctx = (FilterData*)context;
|
||||
|
||||
int cn = CV_MAT_CN(ctx->src_type);
|
||||
int cnes = CV_ELEM_SIZE(ctx->src_type);
|
||||
int ddepth = CV_MAT_DEPTH(ctx->dst_type);
|
||||
float delta_sat = (uchar)(ctx->delta);
|
||||
if(ddepth == CV_8U)
|
||||
delta_sat = (float)saturate_cast<uchar>(ctx->delta);
|
||||
else if(ddepth == CV_16U)
|
||||
delta_sat = (float)saturate_cast<ushort>(ctx->delta);
|
||||
|
||||
// fetch original image data
|
||||
const uchar *ogn_data = src_data - offset_y * src_step - offset_x * cnes;
|
||||
int ogn_step = src_step;
|
||||
|
||||
// ROI fully used in the computation
|
||||
int cal_width = width + ctx->kernel_width - 1;
|
||||
int cal_height = height + ctx->kernel_height - 1;
|
||||
int cal_x = offset_x - ctx->anchor_x; // negative if left border exceeded
|
||||
int cal_y = offset_y - ctx->anchor_y; // negative if top border exceeded
|
||||
|
||||
// calculate source border
|
||||
ctx->padding.resize(cal_width * cal_height * cnes);
|
||||
uchar* pad_data = &ctx->padding[0];
|
||||
int pad_step = cal_width * cnes;
|
||||
|
||||
uchar* pad_ptr;
|
||||
const uchar* ogn_ptr;
|
||||
std::vector<uchar> vec_zeros(cnes, 0);
|
||||
for(int i = 0; i < cal_height; i++)
|
||||
{
|
||||
int y = borderInterpolate(i + cal_y, full_height, ctx->borderType);
|
||||
if(y < 0) {
|
||||
memset(pad_data + i * pad_step, 0, cnes * cal_width);
|
||||
continue;
|
||||
}
|
||||
|
||||
// left border
|
||||
int j = 0;
|
||||
int16x4_t vj = {0, 1, 2, 3};
|
||||
vj += saturate_cast<short>(cal_x);
|
||||
for(; j + cal_x < -4; j += 4, vj += 4)
|
||||
{
|
||||
int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
|
||||
for(int k = 0; k < 4; k++) {
|
||||
if(vx[k] < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes);
|
||||
}
|
||||
}
|
||||
for(; j + cal_x < 0; j++)
|
||||
{
|
||||
int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
|
||||
if(x < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + x * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + j * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes);
|
||||
}
|
||||
|
||||
// center
|
||||
int rborder = MIN(cal_width, full_width - cal_x);
|
||||
ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + j * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes * (rborder - j));
|
||||
|
||||
// right border
|
||||
j = rborder;
|
||||
vj = (int16x4_t){0, 1, 2, 3} + saturate_cast<short>(cal_x + rborder);
|
||||
for(; j <= cal_width - 4; j += 4, vj += 4)
|
||||
{
|
||||
int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
|
||||
for(int k = 0; k < 4; k++) {
|
||||
if(vx[k] < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes);
|
||||
}
|
||||
}
|
||||
for(; j < cal_width; j++)
|
||||
{
|
||||
int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
|
||||
if(x < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + x * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + j * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes);
|
||||
}
|
||||
}
|
||||
|
||||
// prepare the pointers
|
||||
int i, k, count, nz = ctx->nz;
|
||||
const uchar* ker_pts = &ctx->coords[0];
|
||||
const float* ker_cfs = &ctx->coeffs[0];
|
||||
|
||||
if( ddepth == CV_8U )
|
||||
{
|
||||
std::vector<uchar*> src_ptrarr;
|
||||
src_ptrarr.resize(nz);
|
||||
uchar** src_ptrs = &src_ptrarr[0];
|
||||
uchar* dst_row = dst_data;
|
||||
uchar* pad_row = pad_data;
|
||||
|
||||
for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
|
||||
{
|
||||
for( k = 0; k < nz; k++ )
|
||||
src_ptrs[k] = (uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes;
|
||||
|
||||
i = 0;
|
||||
for( ; i <= width * cnes - 8; i += 8 )
|
||||
{
|
||||
float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat};
|
||||
for( k = 0; k < nz; k++ ) {
|
||||
float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
|
||||
// experimental code
|
||||
// ndsrvp_f32_u8_mul8(vker_cfs, *(unsigned long*)(src_ptrs[k] + i), vker_cfs);
|
||||
// ndsrvp_f32_add8(vs0, vker_cfs, vs0);
|
||||
vs0[0] += vker_cfs[0] * src_ptrs[k][i];
|
||||
vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
|
||||
vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
|
||||
vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
|
||||
vs0[4] += vker_cfs[4] * src_ptrs[k][i + 4];
|
||||
vs0[5] += vker_cfs[5] * src_ptrs[k][i + 5];
|
||||
vs0[6] += vker_cfs[6] * src_ptrs[k][i + 6];
|
||||
vs0[7] += vker_cfs[7] * src_ptrs[k][i + 7];
|
||||
}
|
||||
dst_row[i] = saturate_cast<uchar>(vs0[0]);
|
||||
dst_row[i + 1] = saturate_cast<uchar>(vs0[1]);
|
||||
dst_row[i + 2] = saturate_cast<uchar>(vs0[2]);
|
||||
dst_row[i + 3] = saturate_cast<uchar>(vs0[3]);
|
||||
dst_row[i + 4] = saturate_cast<uchar>(vs0[4]);
|
||||
dst_row[i + 5] = saturate_cast<uchar>(vs0[5]);
|
||||
dst_row[i + 6] = saturate_cast<uchar>(vs0[6]);
|
||||
dst_row[i + 7] = saturate_cast<uchar>(vs0[7]);
|
||||
}
|
||||
for( ; i < width * cnes; i++ )
|
||||
{
|
||||
float s0 = delta_sat;
|
||||
for( k = 0; k < nz; k++ ) {
|
||||
s0 += ker_cfs[k] * src_ptrs[k][i];
|
||||
}
|
||||
dst_row[i] = saturate_cast<uchar>(s0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( ddepth == CV_16U )
|
||||
{
|
||||
std::vector<ushort*> src_ptrarr;
|
||||
src_ptrarr.resize(nz);
|
||||
ushort** src_ptrs = &src_ptrarr[0];
|
||||
uchar* dst_row = dst_data;
|
||||
uchar* pad_row = pad_data;
|
||||
|
||||
for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
|
||||
{
|
||||
for( k = 0; k < nz; k++ )
|
||||
src_ptrs[k] = (ushort*)((uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes);
|
||||
|
||||
i = 0;
|
||||
for( ; i <= width * cn - 4; i += 4 )
|
||||
{
|
||||
float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat};
|
||||
for( k = 0; k < nz; k++ ) {
|
||||
float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
|
||||
vs0[0] += vker_cfs[0] * src_ptrs[k][i];
|
||||
vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
|
||||
vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
|
||||
vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
|
||||
}
|
||||
ushort* dst_row_ptr = (ushort*)dst_row;
|
||||
dst_row_ptr[i] = saturate_cast<ushort>(vs0[0]);
|
||||
dst_row_ptr[i + 1] = saturate_cast<ushort>(vs0[1]);
|
||||
dst_row_ptr[i + 2] = saturate_cast<ushort>(vs0[2]);
|
||||
dst_row_ptr[i + 3] = saturate_cast<ushort>(vs0[3]);
|
||||
}
|
||||
for( ; i < width * cn; i++ )
|
||||
{
|
||||
float s0 = delta_sat;
|
||||
for( k = 0; k < nz; k++ ) {
|
||||
s0 += ker_cfs[k] * src_ptrs[k][i];
|
||||
}
|
||||
((ushort*)dst_row)[i] = saturate_cast<ushort>(s0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
int filterFree(cvhalFilter2D *context) {
|
||||
FilterData *ctx = (FilterData*)context;
|
||||
delete ctx;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
300
3rdparty/ndsrvp/src/medianBlur.cpp
vendored
Normal file
@ -0,0 +1,300 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "ndsrvp_hal.hpp"
|
||||
#include "opencv2/imgproc/hal/interface.h"
|
||||
#include "cvutils.hpp"
|
||||
|
||||
namespace cv {
|
||||
|
||||
namespace ndsrvp {
|
||||
|
||||
struct operators_minmax_t {
|
||||
inline void vector(uint8x8_t & a, uint8x8_t & b) const {
|
||||
uint8x8_t t = a;
|
||||
a = __nds__v_umin8(a, b);
|
||||
b = __nds__v_umax8(t, b);
|
||||
}
|
||||
inline void scalar(uchar & a, uchar & b) const {
|
||||
uchar t = a;
|
||||
a = __nds__umin8(a, b);
|
||||
b = __nds__umax8(t, b);
|
||||
}
|
||||
inline void vector(int8x8_t & a, int8x8_t & b) const {
|
||||
int8x8_t t = a;
|
||||
a = __nds__v_smin8(a, b);
|
||||
b = __nds__v_smax8(t, b);
|
||||
}
|
||||
inline void scalar(schar & a, schar & b) const {
|
||||
schar t = a;
|
||||
a = __nds__smin8(a, b);
|
||||
b = __nds__smax8(t, b);
|
||||
}
|
||||
inline void vector(uint16x4_t & a, uint16x4_t & b) const {
|
||||
uint16x4_t t = a;
|
||||
a = __nds__v_umin16(a, b);
|
||||
b = __nds__v_umax16(t, b);
|
||||
}
|
||||
inline void scalar(ushort & a, ushort & b) const {
|
||||
ushort t = a;
|
||||
a = __nds__umin16(a, b);
|
||||
b = __nds__umax16(t, b);
|
||||
}
|
||||
inline void vector(int16x4_t & a, int16x4_t & b) const {
|
||||
int16x4_t t = a;
|
||||
a = __nds__v_smin16(a, b);
|
||||
b = __nds__v_smax16(t, b);
|
||||
}
|
||||
inline void scalar(short & a, short & b) const {
|
||||
short t = a;
|
||||
a = __nds__smin16(a, b);
|
||||
b = __nds__smax16(t, b);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename WT, typename VT> // type, widen type, vector type
|
||||
static void
|
||||
medianBlur_SortNet( const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step,
|
||||
int width, int height, int cn, int ksize )
|
||||
{
|
||||
const T* src = (T*)src_data;
|
||||
T* dst = (T*)dst_data;
|
||||
int sstep = (int)(src_step / sizeof(T));
|
||||
int dstep = (int)(dst_step / sizeof(T));
|
||||
int i, j, k;
|
||||
operators_minmax_t op;
|
||||
|
||||
if( ksize == 3 )
|
||||
{
|
||||
if( width == 1 || height == 1 )
|
||||
{
|
||||
int len = width + height - 1;
|
||||
int sdelta = height == 1 ? cn : sstep;
|
||||
int sdelta0 = height == 1 ? 0 : sstep - cn;
|
||||
int ddelta = height == 1 ? cn : dstep;
|
||||
|
||||
for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
|
||||
for( j = 0; j < cn; j++, src++ )
|
||||
{
|
||||
T p0 = src[i > 0 ? -sdelta : 0];
|
||||
T p1 = src[0];
|
||||
T p2 = src[i < len - 1 ? sdelta : 0];
|
||||
|
||||
op.scalar(p0, p1); op.scalar(p1, p2); op.scalar(p0, p1);
|
||||
dst[j] = (T)p1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
width *= cn;
|
||||
for( i = 0; i < height; i++, dst += dstep )
|
||||
{
|
||||
const T* row0 = src + std::max(i - 1, 0)*sstep;
|
||||
const T* row1 = src + i*sstep;
|
||||
const T* row2 = src + std::min(i + 1, height-1)*sstep;
|
||||
int limit = cn;
|
||||
|
||||
for(j = 0;; )
|
||||
{
|
||||
for( ; j < limit; j++ )
|
||||
{
|
||||
int j0 = j >= cn ? j - cn : j;
|
||||
int j2 = j < width - cn ? j + cn : j;
|
||||
T p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
|
||||
T p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
|
||||
T p6 = row2[j0], p7 = row2[j], p8 = row2[j2];
|
||||
|
||||
op.scalar(p1, p2); op.scalar(p4, p5); op.scalar(p7, p8); op.scalar(p0, p1);
|
||||
op.scalar(p3, p4); op.scalar(p6, p7); op.scalar(p1, p2); op.scalar(p4, p5);
|
||||
op.scalar(p7, p8); op.scalar(p0, p3); op.scalar(p5, p8); op.scalar(p4, p7);
|
||||
op.scalar(p3, p6); op.scalar(p1, p4); op.scalar(p2, p5); op.scalar(p4, p7);
|
||||
op.scalar(p4, p2); op.scalar(p6, p4); op.scalar(p4, p2);
|
||||
dst[j] = (T)p4;
|
||||
}
|
||||
|
||||
if( limit == width )
|
||||
break;
|
||||
|
||||
int nlanes = 8 / sizeof(T);
|
||||
|
||||
for( ; (cn % nlanes == 0) && (j <= width - nlanes - cn); j += nlanes ) // alignment
|
||||
{
|
||||
VT p0 = *(VT*)(row0+j-cn), p1 = *(VT*)(row0+j), p2 = *(VT*)(row0+j+cn);
|
||||
VT p3 = *(VT*)(row1+j-cn), p4 = *(VT*)(row1+j), p5 = *(VT*)(row1+j+cn);
|
||||
VT p6 = *(VT*)(row2+j-cn), p7 = *(VT*)(row2+j), p8 = *(VT*)(row2+j+cn);
|
||||
|
||||
op.vector(p1, p2); op.vector(p4, p5); op.vector(p7, p8); op.vector(p0, p1);
|
||||
op.vector(p3, p4); op.vector(p6, p7); op.vector(p1, p2); op.vector(p4, p5);
|
||||
op.vector(p7, p8); op.vector(p0, p3); op.vector(p5, p8); op.vector(p4, p7);
|
||||
op.vector(p3, p6); op.vector(p1, p4); op.vector(p2, p5); op.vector(p4, p7);
|
||||
op.vector(p4, p2); op.vector(p6, p4); op.vector(p4, p2);
|
||||
*(VT*)(dst+j) = p4;
|
||||
}
|
||||
|
||||
limit = width;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( ksize == 5 )
|
||||
{
|
||||
if( width == 1 || height == 1 )
|
||||
{
|
||||
int len = width + height - 1;
|
||||
int sdelta = height == 1 ? cn : sstep;
|
||||
int sdelta0 = height == 1 ? 0 : sstep - cn;
|
||||
int ddelta = height == 1 ? cn : dstep;
|
||||
|
||||
for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
|
||||
for( j = 0; j < cn; j++, src++ )
|
||||
{
|
||||
int i1 = i > 0 ? -sdelta : 0;
|
||||
int i0 = i > 1 ? -sdelta*2 : i1;
|
||||
int i3 = i < len-1 ? sdelta : 0;
|
||||
int i4 = i < len-2 ? sdelta*2 : i3;
|
||||
T p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];
|
||||
|
||||
op.scalar(p0, p1); op.scalar(p3, p4); op.scalar(p2, p3); op.scalar(p3, p4); op.scalar(p0, p2);
|
||||
op.scalar(p2, p4); op.scalar(p1, p3); op.scalar(p1, p2);
|
||||
dst[j] = (T)p2;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
width *= cn;
|
||||
for( i = 0; i < height; i++, dst += dstep )
|
||||
{
|
||||
const T* row[5];
|
||||
row[0] = src + std::max(i - 2, 0)*sstep;
|
||||
row[1] = src + std::max(i - 1, 0)*sstep;
|
||||
row[2] = src + i*sstep;
|
||||
row[3] = src + std::min(i + 1, height-1)*sstep;
|
||||
row[4] = src + std::min(i + 2, height-1)*sstep;
|
||||
int limit = cn*2;
|
||||
|
||||
for(j = 0;; )
|
||||
{
|
||||
for( ; j < limit; j++ )
|
||||
{
|
||||
T p[25];
|
||||
int j1 = j >= cn ? j - cn : j;
|
||||
int j0 = j >= cn*2 ? j - cn*2 : j1;
|
||||
int j3 = j < width - cn ? j + cn : j;
|
||||
int j4 = j < width - cn*2 ? j + cn*2 : j3;
|
||||
for( k = 0; k < 5; k++ )
|
||||
{
|
||||
const T* rowk = row[k];
|
||||
p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
|
||||
p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
|
||||
p[k*5+4] = rowk[j4];
|
||||
}
|
||||
|
||||
op.scalar(p[1], p[2]); op.scalar(p[0], p[1]); op.scalar(p[1], p[2]); op.scalar(p[4], p[5]); op.scalar(p[3], p[4]);
|
||||
op.scalar(p[4], p[5]); op.scalar(p[0], p[3]); op.scalar(p[2], p[5]); op.scalar(p[2], p[3]); op.scalar(p[1], p[4]);
|
||||
op.scalar(p[1], p[2]); op.scalar(p[3], p[4]); op.scalar(p[7], p[8]); op.scalar(p[6], p[7]); op.scalar(p[7], p[8]);
|
||||
op.scalar(p[10], p[11]); op.scalar(p[9], p[10]); op.scalar(p[10], p[11]); op.scalar(p[6], p[9]); op.scalar(p[8], p[11]);
|
||||
op.scalar(p[8], p[9]); op.scalar(p[7], p[10]); op.scalar(p[7], p[8]); op.scalar(p[9], p[10]); op.scalar(p[0], p[6]);
|
||||
op.scalar(p[4], p[10]); op.scalar(p[4], p[6]); op.scalar(p[2], p[8]); op.scalar(p[2], p[4]); op.scalar(p[6], p[8]);
|
||||
op.scalar(p[1], p[7]); op.scalar(p[5], p[11]); op.scalar(p[5], p[7]); op.scalar(p[3], p[9]); op.scalar(p[3], p[5]);
|
||||
op.scalar(p[7], p[9]); op.scalar(p[1], p[2]); op.scalar(p[3], p[4]); op.scalar(p[5], p[6]); op.scalar(p[7], p[8]);
|
||||
op.scalar(p[9], p[10]); op.scalar(p[13], p[14]); op.scalar(p[12], p[13]); op.scalar(p[13], p[14]); op.scalar(p[16], p[17]);
|
||||
op.scalar(p[15], p[16]); op.scalar(p[16], p[17]); op.scalar(p[12], p[15]); op.scalar(p[14], p[17]); op.scalar(p[14], p[15]);
|
||||
op.scalar(p[13], p[16]); op.scalar(p[13], p[14]); op.scalar(p[15], p[16]); op.scalar(p[19], p[20]); op.scalar(p[18], p[19]);
|
||||
op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[21], p[23]); op.scalar(p[22], p[24]);
|
||||
op.scalar(p[22], p[23]); op.scalar(p[18], p[21]); op.scalar(p[20], p[23]); op.scalar(p[20], p[21]); op.scalar(p[19], p[22]);
|
||||
op.scalar(p[22], p[24]); op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[12], p[18]);
|
||||
op.scalar(p[16], p[22]); op.scalar(p[16], p[18]); op.scalar(p[14], p[20]); op.scalar(p[20], p[24]); op.scalar(p[14], p[16]);
|
||||
op.scalar(p[18], p[20]); op.scalar(p[22], p[24]); op.scalar(p[13], p[19]); op.scalar(p[17], p[23]); op.scalar(p[17], p[19]);
|
||||
op.scalar(p[15], p[21]); op.scalar(p[15], p[17]); op.scalar(p[19], p[21]); op.scalar(p[13], p[14]); op.scalar(p[15], p[16]);
|
||||
op.scalar(p[17], p[18]); op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[0], p[12]);
|
||||
op.scalar(p[8], p[20]); op.scalar(p[8], p[12]); op.scalar(p[4], p[16]); op.scalar(p[16], p[24]); op.scalar(p[12], p[16]);
|
||||
op.scalar(p[2], p[14]); op.scalar(p[10], p[22]); op.scalar(p[10], p[14]); op.scalar(p[6], p[18]); op.scalar(p[6], p[10]);
|
||||
op.scalar(p[10], p[12]); op.scalar(p[1], p[13]); op.scalar(p[9], p[21]); op.scalar(p[9], p[13]); op.scalar(p[5], p[17]);
|
||||
op.scalar(p[13], p[17]); op.scalar(p[3], p[15]); op.scalar(p[11], p[23]); op.scalar(p[11], p[15]); op.scalar(p[7], p[19]);
|
||||
op.scalar(p[7], p[11]); op.scalar(p[11], p[13]); op.scalar(p[11], p[12]);
|
||||
dst[j] = (T)p[12];
|
||||
}
|
||||
|
||||
if( limit == width )
|
||||
break;
|
||||
|
||||
int nlanes = 8 / sizeof(T);
|
||||
|
||||
for( ; (cn % nlanes == 0) && (j <= width - nlanes - cn*2); j += nlanes )
|
||||
{
|
||||
VT p0 = *(VT*)(row[0]+j-cn*2), p5 = *(VT*)(row[1]+j-cn*2), p10 = *(VT*)(row[2]+j-cn*2), p15 = *(VT*)(row[3]+j-cn*2), p20 = *(VT*)(row[4]+j-cn*2);
|
||||
VT p1 = *(VT*)(row[0]+j-cn*1), p6 = *(VT*)(row[1]+j-cn*1), p11 = *(VT*)(row[2]+j-cn*1), p16 = *(VT*)(row[3]+j-cn*1), p21 = *(VT*)(row[4]+j-cn*1);
|
||||
VT p2 = *(VT*)(row[0]+j-cn*0), p7 = *(VT*)(row[1]+j-cn*0), p12 = *(VT*)(row[2]+j-cn*0), p17 = *(VT*)(row[3]+j-cn*0), p22 = *(VT*)(row[4]+j-cn*0);
|
||||
VT p3 = *(VT*)(row[0]+j+cn*1), p8 = *(VT*)(row[1]+j+cn*1), p13 = *(VT*)(row[2]+j+cn*1), p18 = *(VT*)(row[3]+j+cn*1), p23 = *(VT*)(row[4]+j+cn*1);
|
||||
VT p4 = *(VT*)(row[0]+j+cn*2), p9 = *(VT*)(row[1]+j+cn*2), p14 = *(VT*)(row[2]+j+cn*2), p19 = *(VT*)(row[3]+j+cn*2), p24 = *(VT*)(row[4]+j+cn*2);
|
||||
|
||||
op.vector(p1, p2); op.vector(p0, p1); op.vector(p1, p2); op.vector(p4, p5); op.vector(p3, p4);
|
||||
op.vector(p4, p5); op.vector(p0, p3); op.vector(p2, p5); op.vector(p2, p3); op.vector(p1, p4);
|
||||
op.vector(p1, p2); op.vector(p3, p4); op.vector(p7, p8); op.vector(p6, p7); op.vector(p7, p8);
|
||||
op.vector(p10, p11); op.vector(p9, p10); op.vector(p10, p11); op.vector(p6, p9); op.vector(p8, p11);
|
||||
op.vector(p8, p9); op.vector(p7, p10); op.vector(p7, p8); op.vector(p9, p10); op.vector(p0, p6);
|
||||
op.vector(p4, p10); op.vector(p4, p6); op.vector(p2, p8); op.vector(p2, p4); op.vector(p6, p8);
|
||||
op.vector(p1, p7); op.vector(p5, p11); op.vector(p5, p7); op.vector(p3, p9); op.vector(p3, p5);
|
||||
op.vector(p7, p9); op.vector(p1, p2); op.vector(p3, p4); op.vector(p5, p6); op.vector(p7, p8);
|
||||
op.vector(p9, p10); op.vector(p13, p14); op.vector(p12, p13); op.vector(p13, p14); op.vector(p16, p17);
|
||||
op.vector(p15, p16); op.vector(p16, p17); op.vector(p12, p15); op.vector(p14, p17); op.vector(p14, p15);
|
||||
op.vector(p13, p16); op.vector(p13, p14); op.vector(p15, p16); op.vector(p19, p20); op.vector(p18, p19);
|
||||
op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p21, p23); op.vector(p22, p24);
|
||||
op.vector(p22, p23); op.vector(p18, p21); op.vector(p20, p23); op.vector(p20, p21); op.vector(p19, p22);
|
||||
op.vector(p22, p24); op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p12, p18);
|
||||
op.vector(p16, p22); op.vector(p16, p18); op.vector(p14, p20); op.vector(p20, p24); op.vector(p14, p16);
|
||||
op.vector(p18, p20); op.vector(p22, p24); op.vector(p13, p19); op.vector(p17, p23); op.vector(p17, p19);
|
||||
op.vector(p15, p21); op.vector(p15, p17); op.vector(p19, p21); op.vector(p13, p14); op.vector(p15, p16);
|
||||
op.vector(p17, p18); op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p0, p12);
|
||||
op.vector(p8, p20); op.vector(p8, p12); op.vector(p4, p16); op.vector(p16, p24); op.vector(p12, p16);
|
||||
op.vector(p2, p14); op.vector(p10, p22); op.vector(p10, p14); op.vector(p6, p18); op.vector(p6, p10);
|
||||
op.vector(p10, p12); op.vector(p1, p13); op.vector(p9, p21); op.vector(p9, p13); op.vector(p5, p17);
|
||||
op.vector(p13, p17); op.vector(p3, p15); op.vector(p11, p23); op.vector(p11, p15); op.vector(p7, p19);
|
||||
op.vector(p7, p11); op.vector(p11, p13); op.vector(p11, p12);
|
||||
*(VT*)(dst+j) = p12;
|
||||
}
|
||||
|
||||
limit = width;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int medianBlur(const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step,
|
||||
int width, int height, int depth, int cn, int ksize)
|
||||
{
|
||||
bool useSortNet = ((ksize == 3) || (ksize == 5 && ( depth > CV_8U || cn == 2 || cn > 4 )));
|
||||
|
||||
if( useSortNet )
|
||||
{
|
||||
uchar* src_data_rep;
std::vector<uchar> src_data_copy; // declared at function scope so the copy outlives the if-block (in-place case)
if( dst_data == src_data ) {
src_data_copy.resize(src_step * height);
memcpy(src_data_copy.data(), src_data, src_step * height);
src_data_rep = src_data_copy.data();
}
else {
src_data_rep = (uchar*)src_data;
}
|
||||
|
||||
if( depth == CV_8U )
|
||||
medianBlur_SortNet<uchar, int, uint8x8_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
|
||||
else if( depth == CV_8S )
|
||||
medianBlur_SortNet<schar, int, int8x8_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
|
||||
else if( depth == CV_16U )
|
||||
medianBlur_SortNet<ushort, int, uint16x4_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
|
||||
else if( depth == CV_16S )
|
||||
medianBlur_SortNet<short, int, int16x4_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
|
||||
else
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
else return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
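For reference, the 9-point exchange network used in the ksize == 3 path above can be checked in isolation. The sketch below replays the same nineteen min/max exchanges on plain scalars, with std::min/std::max standing in for the __nds__ min/max intrinsics; the vector path applies the identical sequence lane-wise, which is why op.vector mirrors op.scalar exactly.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// min/max exchange, the scalar counterpart of operators_minmax_t::scalar
static void mm(uint8_t& a, uint8_t& b) {
    uint8_t t = a;
    a = std::min(a, b);
    b = std::max(t, b);
}

// Same exchange order as the ksize == 3 branch of medianBlur_SortNet above.
static uint8_t median9(uint8_t p0, uint8_t p1, uint8_t p2,
                       uint8_t p3, uint8_t p4, uint8_t p5,
                       uint8_t p6, uint8_t p7, uint8_t p8) {
    mm(p1, p2); mm(p4, p5); mm(p7, p8); mm(p0, p1);
    mm(p3, p4); mm(p6, p7); mm(p1, p2); mm(p4, p5);
    mm(p7, p8); mm(p0, p3); mm(p5, p8); mm(p4, p7);
    mm(p3, p6); mm(p1, p4); mm(p2, p5); mm(p4, p7);
    mm(p4, p2); mm(p6, p4); mm(p4, p2);
    return p4;                      // the median ends up in p4
}

int main() {
    assert(median9(9, 1, 8, 2, 7, 3, 6, 4, 5) == 5);   // median of 1..9
}
```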
1714
3rdparty/zlib-ng/CMakeLists.txt
vendored
File diff suppressed because it is too large
2
3rdparty/zlib-ng/LICENSE.md
vendored
@ -1,4 +1,4 @@
|
||||
(C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
(C) 1995-2024 Jean-loup Gailly and Mark Adler
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
|
30
3rdparty/zlib-ng/README.md
vendored
@ -21,7 +21,6 @@ Features
|
||||
* Support for CPU intrinsics when available
|
||||
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
|
||||
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
|
||||
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
|
||||
* Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
|
||||
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
|
||||
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
|
||||
@ -95,20 +94,21 @@ make test
|
||||
Build Options
|
||||
-------------
|
||||
|
||||
| CMake | configure | Description | Default |
|
||||
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
|
||||
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
|
||||
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
|
||||
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
|
||||
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
|
||||
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
|
||||
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
|
||||
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
|
||||
| WITH_GTEST | | Build gtest_zlib | ON |
|
||||
| WITH_FUZZERS | | Build test/fuzz | OFF |
|
||||
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
|
||||
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
|
||||
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
|
||||
| CMake | configure | Description | Default |
|
||||
|:---------------------------|:-------------------------|:------------------------------------------------------------------------------------|---------|
|
||||
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
|
||||
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
|
||||
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
|
||||
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
|
||||
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
|
||||
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
|
||||
| WITH_RUNTIME_CPU_DETECTION | | Compiles with runtime CPU detection | ON |
|
||||
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
|
||||
| WITH_GTEST | | Build gtest_zlib | ON |
|
||||
| WITH_FUZZERS | | Build test/fuzz | OFF |
|
||||
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
|
||||
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
|
||||
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
|
||||
|
||||
|
||||
Install
|
||||
|
54
3rdparty/zlib-ng/adler32.c
vendored
@ -7,70 +7,24 @@
|
||||
#include "functable.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
|
||||
uint32_t sum2;
|
||||
unsigned n;
|
||||
|
||||
/* split Adler-32 into component sums */
|
||||
sum2 = (adler >> 16) & 0xffff;
|
||||
adler &= 0xffff;
|
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */
|
||||
if (UNLIKELY(len == 1))
|
||||
return adler32_len_1(adler, buf, sum2);
|
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */
|
||||
if (UNLIKELY(buf == NULL))
|
||||
return 1L;
|
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */
|
||||
if (UNLIKELY(len < 16))
|
||||
return adler32_len_16(adler, buf, len, sum2);
|
||||
|
||||
/* do length NMAX blocks -- requires just one modulo operation */
|
||||
while (len >= NMAX) {
|
||||
len -= NMAX;
|
||||
#ifdef UNROLL_MORE
|
||||
n = NMAX / 16; /* NMAX is divisible by 16 */
|
||||
#else
|
||||
n = NMAX / 8; /* NMAX is divisible by 8 */
|
||||
#endif
|
||||
do {
|
||||
#ifdef UNROLL_MORE
|
||||
DO16(adler, sum2, buf); /* 16 sums unrolled */
|
||||
buf += 16;
|
||||
#else
|
||||
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
|
||||
buf += 8;
|
||||
#endif
|
||||
} while (--n);
|
||||
adler %= BASE;
|
||||
sum2 %= BASE;
|
||||
}
|
||||
|
||||
/* do remaining bytes (less than NMAX, still just one modulo) */
|
||||
return adler32_len_64(adler, buf, len, sum2);
|
||||
}
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
|
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
|
||||
return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
|
||||
return functable.adler32(adler, buf, len);
|
||||
return FUNCTABLE_CALL(adler32)(adler, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
|
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
|
||||
return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
|
||||
return functable.adler32(adler, buf, len);
|
||||
return FUNCTABLE_CALL(adler32)(adler, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
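The hunk above strips the generic Adler-32 loop out of adler32.c (it reappears under arch/generic/adler32_c.c later in this diff) and routes the exported entry points through FUNCTABLE_CALL. For orientation, every implementation the functable can select must produce the plain RFC 1950 value; a minimal reference sketch, independent of zlib-ng's internals:

```cpp
#include <cstddef>
#include <cstdint>

// Reference (unoptimized) Adler-32: a = 1 + sum of bytes, b = sum of the
// running a values, both modulo 65521; the result packs b into the high
// 16 bits and a into the low 16 bits.
static uint32_t adler32_ref(uint32_t adler, const uint8_t* buf, size_t len) {
    const uint32_t BASE = 65521u;            // largest prime below 2^16
    uint32_t a = adler & 0xffffu;
    uint32_t b = (adler >> 16) & 0xffffu;
    for (size_t i = 0; i < len; ++i) {
        a = (a + buf[i]) % BASE;
        b = (b + a) % BASE;
    }
    return (b << 16) | a;
}
```

With the conventional starting value adler = 1 and an empty buffer this returns 1, matching the early-outs kept in the generic code; the NMAX blocking in the optimized loop only defers the expensive modulo, it does not change the result.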
11
3rdparty/zlib-ng/adler32_fold.h
vendored
@ -1,11 +0,0 @@
|
||||
/* adler32_fold.h -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_FOLD_H_
|
||||
#define ADLER32_FOLD_H_
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
|
||||
#endif
|
2
3rdparty/zlib-ng/arch/.gitignore
vendored
@ -1,2 +0,0 @@
|
||||
# ignore Makefiles; they're all automatically generated
|
||||
Makefile
|
7
3rdparty/zlib-ng/arch/arm/Makefile.in
vendored
@ -25,7 +25,6 @@ all: \
|
||||
crc32_acle.o crc32_acle.lo \
|
||||
slide_hash_neon.o slide_hash_neon.lo \
|
||||
slide_hash_armv6.o slide_hash_armv6.lo \
|
||||
insert_string_acle.o insert_string_acle.lo
|
||||
|
||||
adler32_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
@ -69,12 +68,6 @@ slide_hash_armv6.o:
|
||||
slide_hash_armv6.lo:
|
||||
$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
|
||||
|
||||
insert_string_acle.o:
|
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
insert_string_acle.lo:
|
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
|
4
3rdparty/zlib-ng/arch/arm/adler32_neon.c
vendored
@ -7,8 +7,8 @@
|
||||
*/
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
|
||||
static const uint16_t ALIGNED_(16) taps[64] = {
|
||||
|
17
3rdparty/zlib-ng/arch/arm/arm_features.c
vendored
@ -1,4 +1,4 @@
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
#include "arm_features.h"
|
||||
|
||||
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
|
||||
@ -11,6 +11,11 @@
|
||||
# ifndef ID_AA64ISAR0_CRC32_VAL
|
||||
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
|
||||
# endif
|
||||
#elif defined(__OpenBSD__) && defined(__aarch64__)
|
||||
# include <machine/armreg.h>
|
||||
# include <machine/cpu.h>
|
||||
# include <sys/sysctl.h>
|
||||
# include <sys/types.h>
|
||||
#elif defined(__APPLE__)
|
||||
# if !defined(_DARWIN_C_SOURCE)
|
||||
# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
|
||||
@ -30,6 +35,16 @@ static int arm_has_crc32() {
|
||||
#elif defined(__FreeBSD__) && defined(__aarch64__)
|
||||
return getenv("QEMU_EMULATING") == NULL
|
||||
&& ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
|
||||
#elif defined(__OpenBSD__) && defined(__aarch64__)
|
||||
int hascrc32 = 0;
|
||||
int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
|
||||
uint64_t isar0 = 0;
|
||||
size_t len = sizeof(isar0);
|
||||
if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
|
||||
if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE)
|
||||
hascrc32 = 1;
|
||||
}
|
||||
return hascrc32;
|
||||
#elif defined(__APPLE__)
|
||||
int hascrc32;
|
||||
size_t size = sizeof(hascrc32);
|
||||
|
6
3rdparty/zlib-ng/arch/arm/arm_features.h
vendored
@ -2,8 +2,8 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ARM_H_
|
||||
#define ARM_H_
|
||||
#ifndef ARM_FEATURES_H_
|
||||
#define ARM_FEATURES_H_
|
||||
|
||||
struct arm_cpu_features {
|
||||
int has_simd;
|
||||
@ -13,4 +13,4 @@ struct arm_cpu_features {
|
||||
|
||||
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
|
||||
|
||||
#endif /* ARM_H_ */
|
||||
#endif /* ARM_FEATURES_H_ */
|
||||
|
65
3rdparty/zlib-ng/arch/arm/arm_functions.h
vendored
Normal file
@ -0,0 +1,65 @@
|
||||
/* arm_functions.h -- ARM implementations for arch-specific functions.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ARM_FUNCTIONS_H_
|
||||
#define ARM_FUNCTIONS_H_
|
||||
|
||||
#ifdef ARM_NEON
|
||||
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
uint32_t chunksize_neon(void);
|
||||
uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
|
||||
# ifdef HAVE_BUILTIN_CTZLL
|
||||
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
|
||||
uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
void slide_hash_neon(deflate_state *s);
|
||||
void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
|
||||
#ifdef ARM_SIMD
|
||||
void slide_hash_armv6(deflate_state *s);
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
// ARM - SIMD
|
||||
# if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD)
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_armv6
|
||||
# endif
|
||||
// ARM - NEON
|
||||
# if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON
|
||||
# undef native_adler32
|
||||
# define native_adler32 adler32_neon
|
||||
# undef native_chunkmemset_safe
|
||||
# define native_chunkmemset_safe chunkmemset_safe_neon
|
||||
# undef native_chunksize
|
||||
# define native_chunksize chunksize_neon
|
||||
# undef native_inflate_fast
|
||||
# define native_inflate_fast inflate_fast_neon
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_neon
|
||||
# ifdef HAVE_BUILTIN_CTZLL
|
||||
# undef native_compare256
|
||||
# define native_compare256 compare256_neon
|
||||
# undef native_longest_match
|
||||
# define native_longest_match longest_match_neon
|
||||
# undef native_longest_match_slow
|
||||
# define native_longest_match_slow longest_match_slow_neon
|
||||
# endif
|
||||
# endif
|
||||
// ARM - ACLE
|
||||
# if defined(ARM_ACLE) && defined(__ARM_ACLE) && defined(__ARM_FEATURE_CRC32)
|
||||
# undef native_crc32
|
||||
# define native_crc32 crc32_acle
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif /* ARM_FUNCTIONS_H_ */
|
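arm_functions.h relies on a last-definition-wins pattern: when DISABLE_RUNTIME_CPU_DETECTION is set, every native_* macro starts out pointing at the generic implementation (see arch/generic/generic_functions.h further down in this diff) and is then re-pointed at the strongest variant the compile target guarantees. A reduced, self-contained illustration of the pattern, using hypothetical function names rather than zlib-ng's:

```cpp
#include <cstdint>
#include <cstdio>

[[maybe_unused]] static uint32_t checksum_c(const uint8_t*, size_t)    { return 1; } // portable fallback
[[maybe_unused]] static uint32_t checksum_simd(const uint8_t*, size_t) { return 2; } // arch-specific variant

// Default to the portable version, then let stronger targets override it,
// mirroring the #undef / #define chain in arm_functions.h.
#define native_checksum checksum_c
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#  undef  native_checksum
#  define native_checksum checksum_simd
#endif

int main() {
    const uint8_t buf[4] = {0, 1, 2, 3};
    std::printf("%u\n", (unsigned)native_checksum(buf, sizeof(buf)));   // 2 on a NEON build, 1 otherwise
}
```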
4
3rdparty/zlib-ng/arch/arm/chunkset_neon.c
vendored
@ -4,8 +4,8 @@
|
||||
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
#include "zbuild.h"
|
||||
#include "arch/generic/chunk_permute_table.h"
|
||||
|
||||
typedef uint8x16_t chunk_t;
|
||||
|
||||
|
5
3rdparty/zlib-ng/arch/arm/compare256_neon.c
vendored
@ -3,8 +3,9 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "deflate.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
|
||||
|
2
3rdparty/zlib-ng/arch/arm/crc32_acle.c
vendored
@ -7,7 +7,7 @@
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
#include "acle_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
|
||||
Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
|
||||
Z_REGISTER uint32_t c;
|
||||
|
24
3rdparty/zlib-ng/arch/arm/insert_string_acle.c
vendored
@ -1,24 +0,0 @@
|
||||
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
#include "acle_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#define HASH_CALC(s, h, val) \
|
||||
h = __crc32w(0, val)
|
||||
|
||||
#define HASH_CALC_VAR h
|
||||
#define HASH_CALC_VAR_INIT uint32_t h = 0
|
||||
|
||||
#define UPDATE_HASH Z_TARGET_CRC update_hash_acle
|
||||
#define INSERT_STRING Z_TARGET_CRC insert_string_acle
|
||||
#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle
|
||||
|
||||
#include "../../insert_string_tpl.h"
|
||||
#endif
|
7
3rdparty/zlib-ng/arch/arm/neon_intrins.h
vendored
@ -25,6 +25,13 @@
|
||||
out.val[3] = vqsubq_u16(a.val[3], b); \
|
||||
} while (0)
|
||||
|
||||
# if defined(__clang__) && defined(__arm__) && defined(__ANDROID__)
|
||||
/* Clang for 32-bit Android has too strict alignment requirement (:256) for x4 NEON intrinsics */
|
||||
# undef ARM_NEON_HASLD4
|
||||
# undef vld1q_u16_x4
|
||||
# undef vld1q_u8_x4
|
||||
# undef vst1q_u16_x4
|
||||
# endif
|
||||
|
||||
# ifndef ARM_NEON_HASLD4
|
||||
|
||||
|
4
3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
vendored
@ -5,8 +5,8 @@
|
||||
|
||||
#if defined(ARM_SIMD)
|
||||
#include "acle_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/* SIMD version of hash_chain rebase */
|
||||
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
|
||||
|
4
3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
vendored
@ -10,8 +10,8 @@
|
||||
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/* SIMD version of hash_chain rebase */
|
||||
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
|
||||
|
57
3rdparty/zlib-ng/arch/generic/Makefile.in
vendored
@ -1,5 +1,6 @@
|
||||
# Makefile for zlib
|
||||
# Makefile for zlib-ng
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# Copyright (C) 2024 Hans Kristian Rosbach
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
@ -11,12 +12,62 @@ SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all:
|
||||
all: \
|
||||
adler32_c.o adler32_c.lo \
|
||||
adler32_fold_c.o adler32_fold_c.lo \
|
||||
chunkset_c.o chunkset_c.lo \
|
||||
compare256_c.o compare256_c.lo \
|
||||
crc32_braid_c.o crc32_braid_c.lo \
|
||||
crc32_fold_c.o crc32_fold_c.lo \
|
||||
slide_hash_c.o slide_hash_c.lo
|
||||
|
||||
|
||||
adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
|
||||
|
||||
adler32_c.lo: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
|
||||
|
||||
adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
|
||||
|
||||
adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
|
||||
|
||||
chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
|
||||
|
||||
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
|
||||
|
||||
compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
|
||||
|
||||
compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
|
||||
|
||||
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
|
||||
|
||||
crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
|
||||
|
||||
crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
|
||||
|
||||
crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
|
||||
|
||||
slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
|
||||
|
||||
slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
|
||||
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~ \
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
|
54
3rdparty/zlib-ng/arch/generic/adler32_c.c
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "functable.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
|
||||
uint32_t sum2;
|
||||
unsigned n;
|
||||
|
||||
/* split Adler-32 into component sums */
|
||||
sum2 = (adler >> 16) & 0xffff;
|
||||
adler &= 0xffff;
|
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */
|
||||
if (UNLIKELY(len == 1))
|
||||
return adler32_len_1(adler, buf, sum2);
|
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */
|
||||
if (UNLIKELY(buf == NULL))
|
||||
return 1L;
|
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */
|
||||
if (UNLIKELY(len < 16))
|
||||
return adler32_len_16(adler, buf, len, sum2);
|
||||
|
||||
/* do length NMAX blocks -- requires just one modulo operation */
|
||||
while (len >= NMAX) {
|
||||
len -= NMAX;
|
||||
#ifdef UNROLL_MORE
|
||||
n = NMAX / 16; /* NMAX is divisible by 16 */
|
||||
#else
|
||||
n = NMAX / 8; /* NMAX is divisible by 8 */
|
||||
#endif
|
||||
do {
|
||||
#ifdef UNROLL_MORE
|
||||
DO16(adler, sum2, buf); /* 16 sums unrolled */
|
||||
buf += 16;
|
||||
#else
|
||||
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
|
||||
buf += 8;
|
||||
#endif
|
||||
} while (--n);
|
||||
adler %= BASE;
|
||||
sum2 %= BASE;
|
||||
}
|
||||
|
||||
/* do remaining bytes (less than NMAX, still just one modulo) */
|
||||
return adler32_len_64(adler, buf, len, sum2);
|
||||
}
|
@ -5,12 +5,11 @@
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "functable.h"
|
||||
#include "adler32_fold.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
adler = functable.adler32(adler, src, len);
|
||||
adler = FUNCTABLE_CALL(adler32)(adler, src, len);
|
||||
memcpy(dst, src, len);
|
||||
return adler;
|
||||
}
|
@ -5,6 +5,7 @@
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "deflate.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
/* ALIGNED, byte comparison */
|
@ -8,43 +8,9 @@
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "functable.h"
|
||||
#include "crc32_braid_p.h"
|
||||
#include "crc32_braid_tbl.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
|
||||
return (const uint32_t *)crc_table;
|
||||
}
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
|
||||
if (buf == NULL) return 0;
|
||||
|
||||
return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
|
||||
if (buf == NULL) return 0;
|
||||
|
||||
return functable.crc32(crc, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
|
||||
return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
|
||||
return PREFIX(crc32_z)(crc, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
/*
|
||||
A CRC of a message is computed on N braids of words in the message, where
|
||||
each word consists of W bytes (4 or 8). If N is 3, for example, then three
|
||||
@ -66,24 +32,6 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t
|
||||
level. Your mileage may vary.
|
||||
*/
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
# define ZSWAPWORD(word) (word)
|
||||
# define BRAID_TABLE crc_braid_table
|
||||
#elif BYTE_ORDER == BIG_ENDIAN
|
||||
# if W == 8
|
||||
# define ZSWAPWORD(word) ZSWAP64(word)
|
||||
# elif W == 4
|
||||
# define ZSWAPWORD(word) ZSWAP32(word)
|
||||
# endif
|
||||
# define BRAID_TABLE crc_braid_big_table
|
||||
#else
|
||||
# error "No endian defined"
|
||||
#endif
|
||||
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
|
||||
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef W
|
||||
/*
|
||||
@ -112,7 +60,7 @@ static z_word_t crc_word(z_word_t data) {
|
||||
|
||||
/* ========================================================================= */
|
||||
Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
|
||||
Z_REGISTER uint32_t c;
|
||||
uint32_t c;
|
||||
|
||||
/* Pre-condition the CRC */
|
||||
c = (~crc) & 0xffffffff;
|
@ -3,11 +3,9 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "functable.h"
|
||||
|
||||
#include "crc32_fold.h"
|
||||
|
||||
#include <limits.h>
|
||||
#include "crc32.h"
|
||||
|
||||
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
|
||||
crc->value = CRC32_INITIAL_VALUE;
|
||||
@ -15,7 +13,7 @@ Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
|
||||
}
|
||||
|
||||
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
crc->value = functable.crc32(crc->value, src, len);
|
||||
crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
|
||||
memcpy(dst, src, len);
|
||||
}
|
||||
|
||||
@ -25,7 +23,7 @@ Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, ui
|
||||
* same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The
|
||||
* init_crc is an unused argument in this context */
|
||||
Z_UNUSED(init_crc);
|
||||
crc->value = functable.crc32(crc->value, src, len);
|
||||
crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
|
106
3rdparty/zlib-ng/arch/generic/generic_functions.h
vendored
Normal file
@ -0,0 +1,106 @@
|
||||
/* generic_functions.h -- generic C implementations for arch-specific functions.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef GENERIC_FUNCTIONS_H_
|
||||
#define GENERIC_FUNCTIONS_H_
|
||||
|
||||
#include "zendian.h"
|
||||
|
||||
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
|
||||
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
|
||||
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
|
||||
|
||||
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
|
||||
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
|
||||
|
||||
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
|
||||
uint32_t chunksize_c(void);
|
||||
uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
|
||||
|
||||
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
|
||||
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
|
||||
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
|
||||
# ifdef HAVE_BUILTIN_CTZ
|
||||
uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
|
||||
# endif
|
||||
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef void (*slide_hash_func)(deflate_state *s);
|
||||
|
||||
void slide_hash_c(deflate_state *s);
|
||||
|
||||
uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
|
||||
# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
|
||||
# ifdef HAVE_BUILTIN_CTZ
|
||||
uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
# endif
|
||||
|
||||
uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
|
||||
# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
|
||||
# ifdef UNALIGNED64_OK
|
||||
uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
# endif
|
||||
|
||||
|
||||
// Select generic implementation for longest_match, longest_match_slow and compare256 functions.
|
||||
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
# define longest_match_generic longest_match_unaligned_64
|
||||
# define longest_match_slow_generic longest_match_slow_unaligned_64
|
||||
# define compare256_generic compare256_unaligned_64
|
||||
# elif defined(HAVE_BUILTIN_CTZ)
|
||||
# define longest_match_generic longest_match_unaligned_32
|
||||
# define longest_match_slow_generic longest_match_slow_unaligned_32
|
||||
# define compare256_generic compare256_unaligned_32
|
||||
# else
|
||||
# define longest_match_generic longest_match_unaligned_16
|
||||
# define longest_match_slow_generic longest_match_slow_unaligned_16
|
||||
# define compare256_generic compare256_unaligned_16
|
||||
# endif
|
||||
#else
|
||||
# define longest_match_generic longest_match_c
|
||||
# define longest_match_slow_generic longest_match_slow_c
|
||||
# define compare256_generic compare256_c
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
// Generic code
|
||||
# define native_adler32 adler32_c
|
||||
# define native_adler32_fold_copy adler32_fold_copy_c
|
||||
# define native_chunkmemset_safe chunkmemset_safe_c
|
||||
# define native_chunksize chunksize_c
|
||||
# define native_crc32 PREFIX(crc32_braid)
|
||||
# define native_crc32_fold crc32_fold_c
|
||||
# define native_crc32_fold_copy crc32_fold_copy_c
|
||||
# define native_crc32_fold_final crc32_fold_final_c
|
||||
# define native_crc32_fold_reset crc32_fold_reset_c
|
||||
# define native_inflate_fast inflate_fast_c
|
||||
# define native_slide_hash slide_hash_c
|
||||
# define native_longest_match longest_match_generic
|
||||
# define native_longest_match_slow longest_match_slow_generic
|
||||
# define native_compare256 compare256_generic
|
||||
#endif
|
||||
|
||||
#endif
|
@ -1,6 +1,6 @@
|
||||
/* slide_hash.c -- slide hash table C implementation
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
#include <altivec.h>
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
|
||||
typedef vector unsigned char chunk_t;
|
||||
|
||||
|
@ -5,8 +5,10 @@
|
||||
|
||||
#ifdef POWER9
|
||||
#include <altivec.h>
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zendian.h"
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "deflate.h"
|
||||
#include "zendian.h"
|
||||
|
||||
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
|
||||
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
|
||||
|
7
3rdparty/zlib-ng/arch/power/power_features.c
vendored
@ -1,16 +1,19 @@
|
||||
/* power_features.c - POWER feature check
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef HAVE_SYS_AUXV_H
|
||||
# include <sys/auxv.h>
|
||||
#endif
|
||||
#ifdef POWER_NEED_AUXVEC_H
|
||||
# include <linux/auxvec.h>
|
||||
#endif
|
||||
#ifdef __FreeBSD__
|
||||
# include <machine/cpu.h>
|
||||
#endif
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
#include "power_features.h"
|
||||
|
||||
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
|
||||
|
6
3rdparty/zlib-ng/arch/power/power_features.h
vendored
@ -4,8 +4,8 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef POWER_H_
|
||||
#define POWER_H_
|
||||
#ifndef POWER_FEATURES_H_
|
||||
#define POWER_FEATURES_H_
|
||||
|
||||
struct power_cpu_features {
|
||||
int has_altivec;
|
||||
@ -15,4 +15,4 @@ struct power_cpu_features {
|
||||
|
||||
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
|
||||
|
||||
#endif /* POWER_H_ */
|
||||
#endif /* POWER_FEATURES_H_ */
|
||||
|
67
3rdparty/zlib-ng/arch/power/power_functions.h
vendored
Normal file
@ -0,0 +1,67 @@
|
||||
/* power_functions.h -- POWER implementations for arch-specific functions.
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef POWER_FUNCTIONS_H_
|
||||
#define POWER_FUNCTIONS_H_
|
||||
|
||||
#ifdef PPC_VMX
|
||||
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
void slide_hash_vmx(deflate_state *s);
|
||||
#endif
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
uint32_t chunksize_power8(void);
|
||||
uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
void slide_hash_power8(deflate_state *s);
|
||||
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
|
||||
#ifdef POWER9
|
||||
uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
|
||||
uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
// Power - VMX
|
||||
# if defined(PPC_VMX) && defined(__ALTIVEC__)
|
||||
# undef native_adler32
|
||||
# define native_adler32 adler32_vmx
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_vmx
|
||||
# endif
|
||||
// Power8 - VSX
|
||||
# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
|
||||
# undef native_adler32
|
||||
# define native_adler32 adler32_power8
|
||||
# undef native_chunkmemset_safe
|
||||
# define native_chunkmemset_safe chunkmemset_safe_power8
|
||||
# undef native_chunksize
|
||||
# define native_chunksize chunksize_power8
|
||||
# undef native_inflate_fast
|
||||
# define native_inflate_fast inflate_fast_power8
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_power8
|
||||
# endif
|
||||
# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
|
||||
# undef native_crc32
|
||||
# define native_crc32 crc32_power8
|
||||
# endif
|
||||
// Power9
|
||||
# if defined(POWER9) && defined(_ARCH_PWR9)
|
||||
# undef native_compare256
|
||||
# define native_compare256 compare256_power9
|
||||
# undef native_longest_match
|
||||
# define native_longest_match longest_match_power9
|
||||
# undef native_longest_match_slow
|
||||
# define native_longest_match_slow longest_match_slow_power9
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif /* POWER_FUNCTIONS_H_ */
|
4
3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
vendored
@ -9,8 +9,8 @@
|
||||
#include <riscv_vector.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
|
||||
/* split Adler-32 into component sums */
|
||||
|
4
3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
vendored
@ -6,7 +6,9 @@
|
||||
|
||||
#ifdef RISCV_RVV
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "deflate.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
11
3rdparty/zlib-ng/arch/riscv/riscv_features.c
vendored
@ -1,10 +1,13 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/auxv.h>
|
||||
#include <sys/utsname.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
|
||||
# include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "riscv_features.h"
|
||||
|
||||
#define ISA_V_HWCAP (1 << ('v' - 'a'))
|
||||
@ -33,7 +36,11 @@ void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *fea
|
||||
}
|
||||
|
||||
void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
|
||||
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
|
||||
unsigned long hw_cap = getauxval(AT_HWCAP);
|
||||
#else
|
||||
unsigned long hw_cap = 0;
|
||||
#endif
|
||||
features->has_rvv = hw_cap & ISA_V_HWCAP;
|
||||
}
|
||||
|
||||
|
6
3rdparty/zlib-ng/arch/riscv/riscv_features.h
vendored
@ -6,8 +6,8 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef RISCV_H_
|
||||
#define RISCV_H_
|
||||
#ifndef RISCV_FEATURES_H_
|
||||
#define RISCV_FEATURES_H_
|
||||
|
||||
struct riscv_cpu_features {
|
||||
int has_rvv;
|
||||
@ -15,4 +15,4 @@ struct riscv_cpu_features {
|
||||
|
||||
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
|
||||
|
||||
#endif /* RISCV_H_ */
|
||||
#endif /* RISCV_FEATURES_H_ */
|
||||
|
49
3rdparty/zlib-ng/arch/riscv/riscv_functions.h
vendored
Normal file
@ -0,0 +1,49 @@
|
||||
/* riscv_functions.h -- RISCV implementations for arch-specific functions.
|
||||
*
|
||||
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
|
||||
* Contributed by Alex Chiang <alex.chiang@sifive.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef RISCV_FUNCTIONS_H_
|
||||
#define RISCV_FUNCTIONS_H_
|
||||
|
||||
#ifdef RISCV_RVV
|
||||
uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
uint32_t chunksize_rvv(void);
|
||||
uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
|
||||
|
||||
uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
|
||||
void slide_hash_rvv(deflate_state *s);
|
||||
void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
// RISCV - RVV
|
||||
# if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__)
|
||||
# undef native_adler32
|
||||
# define native_adler32 adler32_rvv
|
||||
# undef native_adler32_fold_copy
|
||||
# define native_adler32_fold_copy adler32_fold_copy_rvv
|
||||
# undef native_chunkmemset_safe
|
||||
# define native_chunkmemset_safe chunkmemset_safe_rvv
|
||||
# undef native_chunksize
|
||||
# define native_chunksize chunksize_rvv
|
||||
# undef native_compare256
|
||||
# define native_compare256 compare256_rvv
|
||||
# undef native_inflate_fast
|
||||
# define native_inflate_fast inflate_fast_rvv
|
||||
# undef native_longest_match
|
||||
# define native_longest_match longest_match_rvv
|
||||
# undef native_longest_match_slow
|
||||
# define native_longest_match_slow longest_match_slow_rvv
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_rvv
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif /* RISCV_FUNCTIONS_H_ */
|
10
3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
vendored
@ -8,18 +8,16 @@
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
|
||||
size_t vl;
|
||||
while (entries > 0) {
|
||||
vl = __riscv_vsetvl_e16m4(entries);
|
||||
vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
|
||||
vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
|
||||
vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
|
||||
v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
|
||||
__riscv_vse16_v_u16m4(table, v_tab, vl);
|
||||
vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl);
|
||||
__riscv_vse16_v_u16m4(table, v_diff, vl);
|
||||
table += vl, entries -= vl;
|
||||
}
|
||||
}
|
||||
|
48
3rdparty/zlib-ng/arch/s390/Makefile.in
vendored
Normal file
48
3rdparty/zlib-ng/arch/s390/Makefile.in
vendored
Normal file
@ -0,0 +1,48 @@
# Makefile for zlib-ng
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h

CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
VGFMAFLAG=
NOLTOFLAG=

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

s390_features.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c

s390_features.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c

dfltcc_deflate.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c

dfltcc_deflate.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c

dfltcc_inflate.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c

dfltcc_inflate.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c

crc32-vx.o:
	$(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c

crc32-vx.lo:
	$(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile
277
3rdparty/zlib-ng/arch/s390/README.md
vendored
Normal file
277
3rdparty/zlib-ng/arch/s390/README.md
vendored
Normal file
@ -0,0 +1,277 @@
# Introduction

This directory contains SystemZ deflate hardware acceleration support.
It can be enabled using the following build commands:

    $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
    $ make

or

    $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
    $ make

When built like this, zlib-ng will compress using hardware on level 1 and
using software on all other levels. Decompression will always happen in
hardware. In order to enable hardware compression for levels 1-6
(i.e. to make it used by default) one could add
`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.

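The mask is interpreted bit-per-level: bit *n* enables the hardware path for
compression level *n*, so the default `0x2` covers only level 1 and `0x7e`
covers levels 1-6. A minimal illustration of that check, mirroring the
`level_mask` test in `dfltcc_deflate.c` further down in this commit:

```
/* Illustrative only: the same bit test that
 * dfltcc_can_deflate_with_params() applies to its level_mask. */
static int level_uses_hardware(unsigned int level_mask, int level) {
    return (level_mask & (1U << level)) != 0;
}

/* level_uses_hardware(0x02, 1) -> 1, level_uses_hardware(0x02, 6) -> 0,
 * level_uses_hardware(0x7e, 6) -> 1 */
```
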
SystemZ deflate hardware acceleration is available on [IBM z15](
https://www.ibm.com/products/z15) and newer machines under the name [
"Integrated Accelerator for zEnterprise Data Compression"](
https://www.ibm.com/support/z-content-solutions/compression/). The
programming interface to it is a machine instruction called DEFLATE
CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
the code and the rest of this document refer to this feature simply as
"DFLTCC".

# Performance

Performance figures are published [here](
https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
). The compression speed-up can be as high as 110x and the decompression
speed-up can be as high as 15x.

# Limitations

Two DFLTCC compression calls with identical inputs are not guaranteed to
produce identical outputs. Therefore care should be taken when using
hardware compression when reproducible results are desired. In
particular, the zlib-ng-specific `zng_deflateSetParams` call allows setting
the `Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
particular stream.

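For callers that need bit-exact output, the request might look like the
following sketch. It assumes the zlib-ng native (`zng_*`) API; the
`zng_deflate_param_value` field names are taken from memory of `zlib-ng.h`
and should be verified against the installed header before relying on them.

```
/* Sketch: disable DFLTCC for one stream by requesting reproducible
 * compression. The zng_deflate_param_value layout is assumed here and
 * should be checked against zlib-ng.h. */
#include "zlib-ng.h"

static int request_reproducible(zng_stream *strm) {
    int reproducible = 1;
    zng_deflate_param_value param;
    param.param = Z_DEFLATE_REPRODUCIBLE;
    param.buf = &reproducible;
    param.size = sizeof(reproducible);
    return zng_deflateSetParams(strm, &param, 1); /* Z_OK on success */
}
```
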
DFLTCC does not support every single zlib-ng feature, in particular:

* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
* `inflateMark()`
* `inflatePrime()`
* `inflateSyncPoint()`

When used, these functions will either switch to software, or, in case
this is not possible, gracefully fail.

# Code structure

All SystemZ-specific code lives in the `arch/s390` directory and is
integrated with the rest of zlib-ng using hook macros, as sketched below.

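The general shape of that integration, shown here with abridged names rather
than the literal zlib-ng sources: each hook either expands to the s390
implementation or to a cheap software default.

```
/* Simplified sketch of the hook-macro pattern (not the literal zlib-ng
 * code; macro names and default definitions are abridged). */
#ifdef S390_DFLTCC_DEFLATE
#  include "arch/s390/dfltcc_deflate.h"       /* provides DEFLATE_HOOK etc. */
#else
#  define DEFLATE_HOOK(strm, flush, bstate) 0 /* never take the HW path */
#  define DEFLATE_NEED_CHECKSUM(strm) 1       /* always checksum in SW */
#endif

/* Inside deflate(): try the hardware first, otherwise fall through to the
 * regular software compression functions, roughly:
 *
 *   if (!DEFLATE_HOOK(strm, flush, &bstate))
 *       bstate = software_compress(s, flush);
 */
```
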
## Hook macros

DFLTCC takes as arguments a parameter block, an input buffer, an output
buffer, and a window. Parameter blocks are stored alongside zlib states;
buffers are forwarded from the caller; and the window - which must be
4k-aligned and is always 64k large - is managed using the `PAD_WINDOW()`,
`WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW`, `DEFLATE_ADJUST_WINDOW_SIZE()`
and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.

Software and hardware window formats do not match, therefore,
`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
and `inflateGetDictionary()` need special handling, which is triggered using
`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.

`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
`INFLATE_RESET_KEEP_HOOK()` macros.

`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
calls gracefully fail.

`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
software compression mid-stream using `deflateParams()`. Switching
normally entails flushing the current block, which might not be possible
in low memory situations. `deflateParams()` uses the `DEFLATE_DONE()` hook
in order to detect and gracefully handle such situations.

The algorithm implemented in hardware has a different compression ratio
than the one implemented in software. The `DEFLATE_BOUND_ADJUST_COMPLEN()`
and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
return the correct results for the hardware implementation.

Actual compression and decompression are handled by the `DEFLATE_HOOK()` and
`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
window on its own, calling `updatewindow()` is suppressed using the
`INFLATE_NEED_UPDATEWINDOW()` macro.

In addition to compression, DFLTCC computes CRC-32 and Adler-32
checksums, therefore, whenever it's used, software checksumming is
suppressed using the `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
macros.

While software always produces reproducible compression results, this
is not the case for DFLTCC. Therefore, zlib-ng users are given the
ability to specify whether or not reproducible compression results
are required. While it is always possible to specify this setting
before the compression begins, it is not always possible to do so in
the middle of a deflate stream - the exact conditions for that are
determined by the `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.

## SystemZ-specific code

When zlib-ng is built with DFLTCC, the hooks described above are
converted to calls to functions, which are implemented in
`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
categories:

* Base DFLTCC support, e.g. wrapping the machine instruction - `dfltcc()`.
* Translating between software and hardware data formats, e.g.
  `dfltcc_deflate_set_dictionary()`.
* Translating between software and hardware state machines, e.g.
  `dfltcc_deflate()` and `dfltcc_inflate()`.

The functions from the first two categories are fairly simple, however,
various quirks in both software and hardware state machines make the
functions from the third category quite complicated.

### `dfltcc_deflate()` function

This function is called by `deflate()` and has the following
responsibilities:

* Checking whether DFLTCC can be used with the current stream. If this
  is not the case, then it returns `0`, making `deflate()` use some
  other function in order to compress in software. Otherwise it returns
  `1`.
* Block management and Huffman table generation. DFLTCC ends blocks only
  when explicitly instructed to do so by the software. Furthermore,
  whether to use fixed or dynamic Huffman tables must also be determined
  by the software. Since looking at data in order to gather statistics
  would negate performance benefits, the following approach is used: the
  first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
  block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
  dynamic blocks (see the sketch after this list).
* Writing EOBS. The Block Closing Control bit in the parameter block
  instructs DFLTCC to write EOBS, however, certain conditions need to be
  met: input data length must be non-zero or Continuation Flag must be
  set. To put this in simpler terms, DFLTCC will silently refuse to
  write EOBS if this is the only thing that it is asked to do. Since the
  code has to be able to emit EOBS in software anyway, in order to avoid
  tricky corner cases Block Closing Control is never used. Whether to
  write EOBS is instead controlled by the `soft_bcc` variable.
* Triggering block post-processing. Depending on flush mode, `deflate()`
  must perform various additional actions when a block or a stream ends.
  `dfltcc_deflate()` informs `deflate()` about this using the
  `block_state *result` parameter.
* Converting software state fields into hardware parameter block fields,
  and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
  and Sub-Byte Boundary. Certain fields cannot be translated and must
  persist untouched in the parameter block between calls, for example,
  Continuation Flag or Continuation State Buffer.
* Handling flush modes and low-memory situations. These aspects are
  quite intertwined and pervasive. The general idea here is that the
  code must not do anything in software - whether explicitly by e.g.
  calling `send_eobs()`, or implicitly - by returning to `deflate()`
  with certain return and `*result` values, when Continuation Flag is
  set.
* Ending streams. When a new block is started and flush mode is
  `Z_FINISH`, the Block Header Final parameter block bit is used to mark
  this block as final. However, sometimes an empty final block is
  needed, and, unfortunately, just like with EOBS, DFLTCC will silently
  refuse to do this. The general idea of the DFLTCC implementation is to
  rely as much as possible on the existing code. Here, in order to do
  this, the code pretends that it does not support DFLTCC, which makes
  `deflate()` call a software compression function, which writes an
  empty final block. Whether this is required is controlled by the
  `need_empty_block` variable.
* Error handling. This is simply converting
  Operation-Ending-Supplemental Code to string. Errors can only happen
  due to things like memory corruption, and therefore they don't affect
  the `deflate()` return code.

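A simplified model of that block-management policy, distilled from the
`dfltcc_reset_deflate_state()` and `dfltcc_deflate()` sources included later
in this commit (the constants match the defaults in `dfltcc_detail.h`):

```
/* Simplified model only - the real logic lives in dfltcc_deflate().
 * The first DFLTCC_FIRST_FHT_BLOCK_SIZE bytes go into a fixed-table
 * block; after that a new dynamic-table block is opened roughly every
 * DFLTCC_BLOCK_SIZE bytes of input. */
#include <stdint.h>

#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
#define DFLTCC_BLOCK_SIZE 1048576

struct block_policy {
    uint64_t total_in;        /* bytes consumed so far */
    uint64_t block_threshold; /* close the current block past this point */
};

static void policy_init(struct block_policy *p) {
    p->total_in = 0;
    p->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
}

/* 1 = the block being opened should use a fixed Huffman table. */
static int opens_with_fht(const struct block_policy *p) {
    return p->total_in == 0 && p->block_threshold > 0;
}

/* Called when a block is closed: schedule the next block boundary. */
static void on_block_closed(struct block_policy *p) {
    p->block_threshold = p->total_in + DFLTCC_BLOCK_SIZE;
}
```
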
### `dfltcc_inflate()` function

This function is called by `inflate()` from the `TYPEDO` state (that is,
when all the metadata is parsed and the stream is positioned at the type
bits of the deflate block header) and it's responsible for the following:

* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
  Unfortunately, there is no way to ask DFLTCC to stop decompressing on
  block or tree boundary.
* `inflate()` decompression loop management. This is controlled using
  the return value, which can be either `DFLTCC_INFLATE_BREAK` or
  `DFLTCC_INFLATE_CONTINUE`.
* Converting software state fields into hardware parameter block fields,
  and vice versa. For example, `whave` and History Length or `wnext` and
  History Offset.
* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
  and is controlled by the `last` state field.
* Error handling. Like deflate, error handling comprises
  Operation-Ending-Supplemental Code to string conversion. Unlike
  deflate, errors may happen due to bad inputs, therefore they are
  propagated to `inflate()` by setting the `mode` field to `MEM` or `BAD`.

# Testing

Given the complexity of the DFLTCC machine instruction, it is not clear
whether QEMU TCG will ever support it. At the time of writing, one has to
have access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
DFLTCC is a non-privileged instruction, neither special VM/LPAR
configuration nor root are required.

zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
testing. There is no official IBM Z GitHub Actions runner, so we build
one inspired by `anup-kodlekere/gaplib`.
Future updates to actions-runner might need an updated patch. The .NET
version-number patch has been split into a separate file to avoid
constantly having to change the main patch.

## Configuring the builder.

### Install prerequisites.

```
sudo dnf install podman
```

### Add actions-runner service.

```
sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
sudo systemctl daemon-reload
```

### Create a config file; this needs a GitHub personal access token.

```
# Create file /etc/actions-runner
repo=<owner>/<name>
access_token=<ghp_***>
```

The access token should have the repo scope; consult
https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
for details.

### Autostart actions-runner.

```
$ sudo systemctl enable --now actions-runner
```

## Rebuilding the container

In order to update the `gaplib-actions-runner` podman container, e.g. to get the
latest OS security fixes, follow these steps:
```
# Stop actions-runner service
sudo systemctl stop actions-runner

# Delete old container
sudo podman container rm gaplib-actions-runner

# Delete old image
sudo podman image rm localhost/zlib-ng/actions-runner

# Build image
sudo podman build --squash -f Dockerfile.zlib-ng --tag zlib-ng/actions-runner --build-arg .

# Build container
sudo podman create --name=gaplib-actions-runner --env-file=/etc/actions-runner --init --interactive --volume=actions-runner-temp:/home/actions-runner zlib-ng/actions-runner

# Start actions-runner service
sudo systemctl start actions-runner
```
222
3rdparty/zlib-ng/arch/s390/crc32-vx.c
vendored
Normal file
222
3rdparty/zlib-ng/arch/s390/crc32-vx.c
vendored
Normal file
@ -0,0 +1,222 @@
|
||||
/*
|
||||
* Hardware-accelerated CRC-32 variants for Linux on z Systems
|
||||
*
|
||||
* Use the z/Architecture Vector Extension Facility to accelerate the
|
||||
* computing of bitreflected CRC-32 checksums.
|
||||
*
|
||||
* This CRC-32 implementation algorithm is bitreflected and processes
|
||||
* the least-significant bit first (Little-Endian).
|
||||
*
|
||||
* This code was originally written by Hendrik Brueckner
|
||||
* <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
|
||||
* relicensed under the zlib license.
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "arch_functions.h"
|
||||
|
||||
#include <vecintrin.h>
|
||||
|
||||
typedef unsigned char uv16qi __attribute__((vector_size(16)));
|
||||
typedef unsigned int uv4si __attribute__((vector_size(16)));
|
||||
typedef unsigned long long uv2di __attribute__((vector_size(16)));
|
||||
|
||||
static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
|
||||
/*
|
||||
* The CRC-32 constant block contains reduction constants to fold and
|
||||
* process particular chunks of the input data stream in parallel.
|
||||
*
|
||||
* For the CRC-32 variants, the constants are precomputed according to
|
||||
* these definitions:
|
||||
*
|
||||
* R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
|
||||
* R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
|
||||
* R3 = [(x128+32 mod P'(x) << 32)]' << 1
|
||||
* R4 = [(x128-32 mod P'(x) << 32)]' << 1
|
||||
* R5 = [(x64 mod P'(x) << 32)]' << 1
|
||||
* R6 = [(x32 mod P'(x) << 32)]' << 1
|
||||
*
|
||||
* The bitreflected Barret reduction constant, u', is defined as
|
||||
* the bit reversal of floor(x**64 / P(x)).
|
||||
*
|
||||
* where P(x) is the polynomial in the normal domain and the P'(x) is the
|
||||
* polynomial in the reversed (bitreflected) domain.
|
||||
*
|
||||
* CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
|
||||
*
|
||||
* P(x) = 0x04C11DB7
|
||||
* P'(x) = 0xEDB88320
|
||||
*/
|
||||
const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
|
||||
const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
|
||||
const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
|
||||
const uv2di r5 = {0, 0x163CD6124}; /* R5 */
|
||||
const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
|
||||
const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
|
||||
|
||||
/*
|
||||
* Load the initial CRC value.
|
||||
*
|
||||
* The CRC value is loaded into the rightmost word of the
|
||||
* vector register and is later XORed with the LSB portion
|
||||
* of the loaded input data.
|
||||
*/
|
||||
uv2di v0 = {0, 0};
|
||||
v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
|
||||
|
||||
/* Load a 64-byte data chunk and XOR with CRC */
|
||||
uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
|
||||
uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
|
||||
uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
|
||||
uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
|
||||
|
||||
v1 ^= v0;
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
|
||||
while (len >= 64) {
|
||||
/* Load the next 64-byte data chunk */
|
||||
uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
|
||||
uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
|
||||
uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
|
||||
uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
|
||||
|
||||
/*
|
||||
* Perform a GF(2) multiplication of the doublewords in V1 with
|
||||
* the R1 and R2 reduction constants in V0. The intermediate result
|
||||
* is then folded (accumulated) with the next data chunk in PART1 and
|
||||
* stored in V1. Repeat this step for the register contents
|
||||
* in V2, V3, and V4 respectively.
|
||||
*/
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
|
||||
v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
|
||||
v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
|
||||
v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
|
||||
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
|
||||
* and R4 and accumulating the next 128-bit chunk until a single 128-bit
|
||||
* value remains.
|
||||
*/
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
|
||||
|
||||
while (len >= 16) {
|
||||
/* Load next data chunk */
|
||||
v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
|
||||
|
||||
/* Fold next data chunk */
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
|
||||
|
||||
buf += 16;
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up a vector register for byte shifts. The shift value must
|
||||
* be loaded in bits 1-4 in byte element 7 of a vector register.
|
||||
* Shift by 8 bytes: 0x40
|
||||
* Shift by 4 bytes: 0x20
|
||||
*/
|
||||
uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
v9 = vec_insert((unsigned char)0x40, v9, 7);
|
||||
|
||||
/*
|
||||
* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
|
||||
* to move R4 into the rightmost doubleword and set the leftmost
|
||||
* doubleword to 0x1.
|
||||
*/
|
||||
v0 = vec_srb(r4r3, (uv2di)v9);
|
||||
v0[0] = 1;
|
||||
|
||||
/*
|
||||
* Compute GF(2) product of V1 and V0. The rightmost doubleword
|
||||
* of V1 is multiplied with R4. The leftmost doubleword of V1 is
|
||||
* multiplied by 0x1 and is then XORed with rightmost product.
|
||||
* Implicitly, the intermediate leftmost product becomes padded
|
||||
*/
|
||||
v1 = (uv2di)vec_gfmsum_128(v0, v1);
|
||||
|
||||
/*
|
||||
* Now do the final 32-bit fold by multiplying the rightmost word
|
||||
* in V1 with R5 and XOR the result with the remaining bits in V1.
|
||||
*
|
||||
* To achieve this by a single VGFMAG, right shift V1 by a word
|
||||
* and store the result in V2 which is then accumulated. Use the
|
||||
* vector unpack instruction to load the rightmost half of the
|
||||
* doubleword into the rightmost doubleword element of V1; the other
|
||||
* half is loaded in the leftmost doubleword.
|
||||
* The vector register with CONST_R5 contains the R5 constant in the
|
||||
* rightmost doubleword and the leftmost doubleword is zero to ignore
|
||||
* the leftmost product of V1.
|
||||
*/
|
||||
v9 = vec_insert((unsigned char)0x20, v9, 7);
|
||||
v2 = vec_srb(v1, (uv2di)v9);
|
||||
v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
|
||||
|
||||
/*
|
||||
* Apply a Barret reduction to compute the final 32-bit CRC value.
|
||||
*
|
||||
* The input values to the Barret reduction are the degree-63 polynomial
|
||||
* in V1 (R(x)), degree-32 generator polynomial, and the reduction
|
||||
* constant u. The Barret reduction result is the CRC value of R(x) mod
|
||||
* P(x).
|
||||
*
|
||||
* The Barret reduction algorithm is defined as:
|
||||
*
|
||||
* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
|
||||
* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
|
||||
* 3. C(x) = R(x) XOR T2(x) mod x^32
|
||||
*
|
||||
* Note: The leftmost doubleword of vector register containing
|
||||
* CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
|
||||
* is zero and does not contribute to the final result.
|
||||
*/
|
||||
|
||||
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
|
||||
v2 = vec_unpackl((uv4si)v1);
|
||||
v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
|
||||
|
||||
/*
|
||||
* Compute the GF(2) product of the CRC polynomial with T1(x) in
|
||||
* V2 and XOR the intermediate result, T2(x), with the value in V1.
|
||||
* The final result is stored in word element 2 of V2.
|
||||
*/
|
||||
v2 = vec_unpackl((uv4si)v2);
|
||||
v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
|
||||
|
||||
return ((uv4si)v2)[2];
|
||||
}
|
||||
|
||||
#define VX_MIN_LEN 64
|
||||
#define VX_ALIGNMENT 16L
|
||||
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
|
||||
|
||||
uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
|
||||
size_t prealign, aligned, remaining;
|
||||
|
||||
if (len < VX_MIN_LEN + VX_ALIGN_MASK)
|
||||
return PREFIX(crc32_braid)(crc, buf, len);
|
||||
|
||||
if ((uintptr_t)buf & VX_ALIGN_MASK) {
|
||||
prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
|
||||
len -= prealign;
|
||||
crc = PREFIX(crc32_braid)(crc, buf, prealign);
|
||||
buf += prealign;
|
||||
}
|
||||
aligned = len & ~VX_ALIGN_MASK;
|
||||
remaining = len & VX_ALIGN_MASK;
|
||||
|
||||
crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
|
||||
|
||||
if (remaining)
|
||||
crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);
|
||||
|
||||
return crc;
|
||||
}
|
119
3rdparty/zlib-ng/arch/s390/dfltcc_common.h
vendored
Normal file
119
3rdparty/zlib-ng/arch/s390/dfltcc_common.h
vendored
Normal file
@ -0,0 +1,119 @@
|
||||
#ifndef DFLTCC_COMMON_H
|
||||
#define DFLTCC_COMMON_H
|
||||
|
||||
#include "zutil.h"
|
||||
|
||||
/*
|
||||
Parameter Block for Query Available Functions.
|
||||
*/
|
||||
struct dfltcc_qaf_param {
|
||||
char fns[16];
|
||||
char reserved1[8];
|
||||
char fmts[2];
|
||||
char reserved2[6];
|
||||
} ALIGNED_(8);
|
||||
|
||||
/*
|
||||
Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
|
||||
*/
|
||||
struct dfltcc_param_v0 {
|
||||
uint16_t pbvn; /* Parameter-Block-Version Number */
|
||||
uint8_t mvn; /* Model-Version Number */
|
||||
uint8_t ribm; /* Reserved for IBM use */
|
||||
uint32_t reserved32 : 31;
|
||||
uint32_t cf : 1; /* Continuation Flag */
|
||||
uint8_t reserved64[8];
|
||||
uint32_t nt : 1; /* New Task */
|
||||
uint32_t reserved129 : 1;
|
||||
uint32_t cvt : 1; /* Check Value Type */
|
||||
uint32_t reserved131 : 1;
|
||||
uint32_t htt : 1; /* Huffman-Table Type */
|
||||
uint32_t bcf : 1; /* Block-Continuation Flag */
|
||||
uint32_t bcc : 1; /* Block Closing Control */
|
||||
uint32_t bhf : 1; /* Block Header Final */
|
||||
uint32_t reserved136 : 1;
|
||||
uint32_t reserved137 : 1;
|
||||
uint32_t dhtgc : 1; /* DHT Generation Control */
|
||||
uint32_t reserved139 : 5;
|
||||
uint32_t reserved144 : 5;
|
||||
uint32_t sbb : 3; /* Sub-Byte Boundary */
|
||||
uint8_t oesc; /* Operation-Ending-Supplemental Code */
|
||||
uint32_t reserved160 : 12;
|
||||
uint32_t ifs : 4; /* Incomplete-Function Status */
|
||||
uint16_t ifl; /* Incomplete-Function Length */
|
||||
uint8_t reserved192[8];
|
||||
uint8_t reserved256[8];
|
||||
uint8_t reserved320[4];
|
||||
uint16_t hl; /* History Length */
|
||||
uint32_t reserved368 : 1;
|
||||
uint16_t ho : 15; /* History Offset */
|
||||
uint32_t cv; /* Check Value */
|
||||
uint32_t eobs : 15; /* End-of-block Symbol */
|
||||
uint32_t reserved431: 1;
|
||||
uint8_t eobl : 4; /* End-of-block Length */
|
||||
uint32_t reserved436 : 12;
|
||||
uint32_t reserved448 : 4;
|
||||
uint16_t cdhtl : 12; /* Compressed-Dynamic-Huffman Table
|
||||
Length */
|
||||
uint8_t reserved464[6];
|
||||
uint8_t cdht[288]; /* Compressed-Dynamic-Huffman Table */
|
||||
uint8_t reserved[24];
|
||||
uint8_t ribm2[8]; /* Reserved for IBM use */
|
||||
uint8_t csb[1152]; /* Continuation-State Buffer */
|
||||
} ALIGNED_(8);
|
||||
|
||||
/*
|
||||
Extension of inflate_state and deflate_state.
|
||||
*/
|
||||
struct dfltcc_state {
|
||||
struct dfltcc_param_v0 param; /* Parameter block. */
|
||||
struct dfltcc_qaf_param af; /* Available functions. */
|
||||
char msg[64]; /* Buffer for strm->msg */
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
struct dfltcc_state common;
|
||||
uint16_t level_mask; /* Levels on which to use DFLTCC */
|
||||
uint32_t block_size; /* New block each X bytes */
|
||||
size_t block_threshold; /* New block after total_in > X */
|
||||
uint32_t dht_threshold; /* New block only if avail_in >= X */
|
||||
} arch_deflate_state;
|
||||
|
||||
typedef struct {
|
||||
struct dfltcc_state common;
|
||||
} arch_inflate_state;
|
||||
|
||||
/*
|
||||
History buffer size.
|
||||
*/
|
||||
#define HB_BITS 15
|
||||
#define HB_SIZE (1 << HB_BITS)
|
||||
|
||||
/*
|
||||
Sizes of deflate block parts.
|
||||
*/
|
||||
#define DFLTCC_BLOCK_HEADER_BITS 3
|
||||
#define DFLTCC_HLITS_COUNT_BITS 5
|
||||
#define DFLTCC_HDISTS_COUNT_BITS 5
|
||||
#define DFLTCC_HCLENS_COUNT_BITS 4
|
||||
#define DFLTCC_MAX_HCLENS 19
|
||||
#define DFLTCC_HCLEN_BITS 3
|
||||
#define DFLTCC_MAX_HLITS 286
|
||||
#define DFLTCC_MAX_HDISTS 30
|
||||
#define DFLTCC_MAX_HLIT_HDIST_BITS 7
|
||||
#define DFLTCC_MAX_SYMBOL_BITS 16
|
||||
#define DFLTCC_MAX_EOBS_BITS 15
|
||||
#define DFLTCC_MAX_PADDING_BITS 7
|
||||
|
||||
#define DEFLATE_BOUND_COMPLEN(source_len) \
|
||||
((DFLTCC_BLOCK_HEADER_BITS + \
|
||||
DFLTCC_HLITS_COUNT_BITS + \
|
||||
DFLTCC_HDISTS_COUNT_BITS + \
|
||||
DFLTCC_HCLENS_COUNT_BITS + \
|
||||
DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
|
||||
(DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
|
||||
(source_len) * DFLTCC_MAX_SYMBOL_BITS + \
|
||||
DFLTCC_MAX_EOBS_BITS + \
|
||||
DFLTCC_MAX_PADDING_BITS) >> 3)
|
||||
|
||||
#endif
|
383
3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c
vendored
Normal file
383
3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c
vendored
Normal file
@ -0,0 +1,383 @@
|
||||
/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
|
||||
|
||||
/*
|
||||
Use the following commands to build zlib-ng with DFLTCC compression support:
|
||||
|
||||
$ ./configure --with-dfltcc-deflate
|
||||
or
|
||||
|
||||
$ cmake -DWITH_DFLTCC_DEFLATE=1 .
|
||||
|
||||
and then
|
||||
|
||||
$ make
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
#include "trees_emit.h"
|
||||
#include "dfltcc_deflate.h"
|
||||
#include "dfltcc_detail.h"
|
||||
|
||||
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
arch_deflate_state *dfltcc_state = &state->arch;
|
||||
|
||||
dfltcc_reset_state(&dfltcc_state->common);
|
||||
|
||||
/* Initialize tuning parameters */
|
||||
dfltcc_state->level_mask = DFLTCC_LEVEL_MASK;
|
||||
dfltcc_state->block_size = DFLTCC_BLOCK_SIZE;
|
||||
dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
|
||||
dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
|
||||
}
|
||||
|
||||
static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
|
||||
int reproducible) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
arch_deflate_state *dfltcc_state = &state->arch;
|
||||
|
||||
/* Unsupported compression settings */
|
||||
if ((dfltcc_state->level_mask & (1 << level)) == 0)
|
||||
return 0;
|
||||
if (window_bits != HB_BITS)
|
||||
return 0;
|
||||
if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY)
|
||||
return 0;
|
||||
if (reproducible)
|
||||
return 0;
|
||||
|
||||
/* Unsupported hardware */
|
||||
if (!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_GDHT) ||
|
||||
!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_CMPR) ||
|
||||
!is_bit_set(dfltcc_state->common.af.fmts, DFLTCC_FMT0))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
|
||||
return dfltcc_can_deflate_with_params(strm, state->level, state->w_bits, state->strategy, state->reproducible);
|
||||
}
|
||||
|
||||
static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
size_t avail_in = strm->avail_in;
|
||||
|
||||
dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL);
|
||||
}
|
||||
|
||||
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
size_t avail_in = strm->avail_in;
|
||||
size_t avail_out = strm->avail_out;
|
||||
dfltcc_cc cc;
|
||||
|
||||
cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
|
||||
param, &strm->next_out, &avail_out,
|
||||
&strm->next_in, &avail_in, state->window);
|
||||
strm->total_in += (strm->avail_in - avail_in);
|
||||
strm->total_out += (strm->avail_out - avail_out);
|
||||
strm->avail_in = avail_in;
|
||||
strm->avail_out = avail_out;
|
||||
return cc;
|
||||
}
|
||||
|
||||
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
|
||||
send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
|
||||
PREFIX(flush_pending)(strm);
|
||||
if (state->pending != 0) {
|
||||
/* The remaining data is located in pending_out[0:pending]. If someone
|
||||
* calls put_byte() - this might happen in deflate() - the byte will be
|
||||
* placed into pending_buf[pending], which is incorrect. Move the
|
||||
* remaining data to the beginning of pending_buf so that put_byte() is
|
||||
* usable again.
|
||||
*/
|
||||
memmove(state->pending_buf, state->pending_out, state->pending);
|
||||
state->pending_out = state->pending_buf;
|
||||
}
|
||||
#ifdef ZLIB_DEBUG
|
||||
state->compressed_len += param->eobl;
|
||||
#endif
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
arch_deflate_state *dfltcc_state = &state->arch;
|
||||
struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
|
||||
uInt masked_avail_in;
|
||||
dfltcc_cc cc;
|
||||
int need_empty_block;
|
||||
int soft_bcc;
|
||||
int no_flush;
|
||||
|
||||
if (!PREFIX(dfltcc_can_deflate)(strm)) {
|
||||
/* Clear history. */
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
again:
|
||||
masked_avail_in = 0;
|
||||
soft_bcc = 0;
|
||||
no_flush = flush == Z_NO_FLUSH;
|
||||
|
||||
/* No input data. Return, except when Continuation Flag is set, which means
|
||||
* that DFLTCC has buffered some output in the parameter block and needs to
|
||||
* be called again in order to flush it.
|
||||
*/
|
||||
if (strm->avail_in == 0 && !param->cf) {
|
||||
/* A block is still open, and the hardware does not support closing
|
||||
* blocks without adding data. Thus, close it manually.
|
||||
*/
|
||||
if (!no_flush && param->bcf) {
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
}
|
||||
/* Let one of deflate_* functions write a trailing empty block. */
|
||||
if (flush == Z_FINISH)
|
||||
return 0;
|
||||
/* Clear history. */
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0;
|
||||
/* Trigger block post-processing if necessary. */
|
||||
*result = no_flush ? need_more : block_done;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* There is an open non-BFINAL block, we are not going to close it just
|
||||
* yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
|
||||
* more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
|
||||
* DHT in order to adapt to a possibly changed input data distribution.
|
||||
*/
|
||||
if (param->bcf && no_flush &&
|
||||
strm->total_in > dfltcc_state->block_threshold &&
|
||||
strm->avail_in >= dfltcc_state->dht_threshold) {
|
||||
if (param->cf) {
|
||||
/* We need to flush the DFLTCC buffer before writing the
|
||||
* End-of-block Symbol. Mask the input data and proceed as usual.
|
||||
*/
|
||||
masked_avail_in += strm->avail_in;
|
||||
strm->avail_in = 0;
|
||||
no_flush = 0;
|
||||
} else {
|
||||
/* DFLTCC buffer is empty, so we can manually write the
|
||||
* End-of-block Symbol right away.
|
||||
*/
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
|
||||
}
|
||||
}
|
||||
|
||||
/* No space for compressed data. If we proceed, dfltcc_cmpr() will return
|
||||
* DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
|
||||
* set BCF=1, which is wrong. Avoid complications and return early.
|
||||
*/
|
||||
if (strm->avail_out == 0) {
|
||||
*result = need_more;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* The caller gave us too much data. Pass only one block worth of
|
||||
* uncompressed data to DFLTCC and mask the rest, so that on the next
|
||||
* iteration we start a new block.
|
||||
*/
|
||||
if (no_flush && strm->avail_in > dfltcc_state->block_size) {
|
||||
masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
|
||||
strm->avail_in = dfltcc_state->block_size;
|
||||
}
|
||||
|
||||
/* When we have an open non-BFINAL deflate block and caller indicates that
|
||||
* the stream is ending, we need to close an open deflate block and open a
|
||||
* BFINAL one.
|
||||
*/
|
||||
need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
|
||||
|
||||
/* Translate stream to parameter block */
|
||||
param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
|
||||
if (!no_flush)
|
||||
/* We need to close a block. Always do this in software - when there is
|
||||
* no input data, the hardware will not honor BCC. */
|
||||
soft_bcc = 1;
|
||||
if (flush == Z_FINISH && !param->bcf)
|
||||
/* We are about to open a BFINAL block, set Block Header Final bit
|
||||
* until the stream ends.
|
||||
*/
|
||||
param->bhf = 1;
|
||||
/* DFLTCC-CMPR will write to next_out, so make sure that buffers with
|
||||
* higher precedence are empty.
|
||||
*/
|
||||
Assert(state->pending == 0, "There must be no pending bytes");
|
||||
Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
|
||||
param->sbb = (unsigned int)state->bi_valid;
|
||||
if (param->sbb > 0)
|
||||
*strm->next_out = (unsigned char)state->bi_buf;
|
||||
/* Honor history and check value */
|
||||
param->nt = 0;
|
||||
if (state->wrap == 1)
|
||||
param->cv = strm->adler;
|
||||
else if (state->wrap == 2)
|
||||
param->cv = ZSWAP32(state->crc_fold.value);
|
||||
|
||||
/* When opening a block, choose a Huffman-Table Type */
|
||||
if (!param->bcf) {
|
||||
if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
|
||||
param->htt = HTT_FIXED;
|
||||
else {
|
||||
param->htt = HTT_DYNAMIC;
|
||||
dfltcc_gdht(strm);
|
||||
}
|
||||
}
|
||||
|
||||
/* Deflate */
|
||||
do {
|
||||
cc = dfltcc_cmpr(strm);
|
||||
if (strm->avail_in < 4096 && masked_avail_in > 0)
|
||||
/* We are about to call DFLTCC with a small input buffer, which is
|
||||
* inefficient. Since there is masked data, there will be at least
|
||||
* one more DFLTCC call, so skip the current one and make the next
|
||||
* one handle more data.
|
||||
*/
|
||||
break;
|
||||
} while (cc == DFLTCC_CC_AGAIN);
|
||||
|
||||
/* Translate parameter block to stream */
|
||||
strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
|
||||
state->bi_valid = param->sbb;
|
||||
if (state->bi_valid == 0)
|
||||
state->bi_buf = 0; /* Avoid accessing next_out */
|
||||
else
|
||||
state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
|
||||
if (state->wrap == 1)
|
||||
strm->adler = param->cv;
|
||||
else if (state->wrap == 2)
|
||||
state->crc_fold.value = ZSWAP32(param->cv);
|
||||
|
||||
/* Unmask the input data */
|
||||
strm->avail_in += masked_avail_in;
|
||||
masked_avail_in = 0;
|
||||
|
||||
/* If we encounter an error, it means there is a bug in DFLTCC call */
|
||||
Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
|
||||
|
||||
/* Update Block-Continuation Flag. It will be used to check whether to call
|
||||
* GDHT the next time.
|
||||
*/
|
||||
if (cc == DFLTCC_CC_OK) {
|
||||
if (soft_bcc) {
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
|
||||
} else
|
||||
param->bcf = 1;
|
||||
if (flush == Z_FINISH) {
|
||||
if (need_empty_block)
|
||||
/* Make the current deflate() call also close the stream */
|
||||
return 0;
|
||||
else {
|
||||
bi_windup(state);
|
||||
*result = finish_done;
|
||||
}
|
||||
} else {
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0; /* Clear history */
|
||||
*result = flush == Z_NO_FLUSH ? need_more : block_done;
|
||||
}
|
||||
} else {
|
||||
param->bcf = 1;
|
||||
*result = need_more;
|
||||
}
|
||||
if (strm->avail_in != 0 && strm->avail_out != 0)
|
||||
goto again; /* deflate() must use all input or all output */
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
Switching between hardware and software compression.
|
||||
|
||||
DFLTCC does not support all zlib settings, e.g. generation of non-compressed
|
||||
blocks or alternative window sizes. When such settings are applied on the
|
||||
fly with deflateParams, we need to convert between hardware and software
|
||||
window formats.
|
||||
*/
|
||||
static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
|
||||
return strm->total_in > 0 || param->nt == 0 || param->hl > 0;
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
int could_deflate = PREFIX(dfltcc_can_deflate)(strm);
|
||||
int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible);
|
||||
|
||||
if (can_deflate == could_deflate)
|
||||
/* We continue to work in the same mode - no changes needed */
|
||||
return Z_OK;
|
||||
|
||||
if (!dfltcc_was_deflate_used(strm))
|
||||
/* DFLTCC was not used yet - no changes needed */
|
||||
return Z_OK;
|
||||
|
||||
/* For now, do not convert between window formats - simply get rid of the old data instead */
|
||||
*flush = Z_FULL_FLUSH;
|
||||
return Z_OK;
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
|
||||
/* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might
|
||||
* close the block without resetting the compression state. Detect this
|
||||
* situation and return that deflation is not done.
|
||||
*/
|
||||
if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
|
||||
return 0;
|
||||
|
||||
/* Return that deflation is not done if DFLTCC is used and either it
|
||||
* buffered some data (Continuation Flag is set), or has not written EOBS
|
||||
* yet (Block-Continuation Flag is set).
|
||||
*/
|
||||
return !PREFIX(dfltcc_can_deflate)(strm) || (!param->cf && !param->bcf);
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
|
||||
return reproducible != state->reproducible && !dfltcc_was_deflate_used(strm);
|
||||
}
|
||||
|
||||
/*
|
||||
Preloading history.
|
||||
*/
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
|
||||
const unsigned char *dictionary, uInt dict_length) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
|
||||
append_history(param, state->window, dictionary, dict_length);
|
||||
state->strstart = 1; /* Add FDICT to zlib header */
|
||||
state->block_start = state->strstart; /* Make deflate_stored happy */
|
||||
return Z_OK;
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
|
||||
if (dictionary)
|
||||
get_history(param, state->window, dictionary);
|
||||
if (dict_length)
|
||||
*dict_length = param->hl;
|
||||
return Z_OK;
|
||||
}
|
58
3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h
vendored
Normal file
58
3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
#ifndef DFLTCC_DEFLATE_H
|
||||
#define DFLTCC_DEFLATE_H
|
||||
|
||||
#include "deflate.h"
|
||||
#include "dfltcc_common.h"
|
||||
|
||||
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
|
||||
const unsigned char *dictionary, uInt dict_length);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
|
||||
|
||||
#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
|
||||
do { \
|
||||
if (PREFIX(dfltcc_can_deflate)((strm))) \
|
||||
return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
|
||||
do { \
|
||||
if (PREFIX(dfltcc_can_deflate)((strm))) \
|
||||
return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
|
||||
|
||||
#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
|
||||
do { \
|
||||
int err; \
|
||||
\
|
||||
err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
|
||||
if (err == Z_STREAM_ERROR) \
|
||||
return err; \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
|
||||
|
||||
#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
|
||||
do { \
|
||||
if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
|
||||
(complen) = DEFLATE_BOUND_COMPLEN(source_len); \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
|
||||
|
||||
#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
|
||||
|
||||
#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
|
||||
|
||||
#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
|
||||
|
||||
#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
|
||||
|
||||
#endif
|
275
3rdparty/zlib-ng/arch/s390/dfltcc_detail.h
vendored
Normal file
275
3rdparty/zlib-ng/arch/s390/dfltcc_detail.h
vendored
Normal file
@ -0,0 +1,275 @@
|
||||
#include "zbuild.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
#include <sys/sdt.h>
|
||||
#endif
|
||||
|
||||
/*
|
||||
Tuning parameters.
|
||||
*/
|
||||
#ifndef DFLTCC_LEVEL_MASK
|
||||
#define DFLTCC_LEVEL_MASK 0x2
|
||||
#endif
|
||||
#ifndef DFLTCC_BLOCK_SIZE
|
||||
#define DFLTCC_BLOCK_SIZE 1048576
|
||||
#endif
|
||||
#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
|
||||
#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
|
||||
#endif
|
||||
#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
|
||||
#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
|
||||
#endif
|
||||
#ifndef DFLTCC_RIBM
|
||||
#define DFLTCC_RIBM 0
|
||||
#endif
|
||||
|
||||
#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
|
||||
|
||||
#define DFLTCC_SIZEOF_QAF 32
|
||||
static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
|
||||
|
||||
static inline int is_bit_set(const char *bits, int n) {
|
||||
return bits[n / 8] & (1 << (7 - (n % 8)));
|
||||
}
|
||||
|
||||
static inline void clear_bit(char *bits, int n) {
|
||||
bits[n / 8] &= ~(1 << (7 - (n % 8)));
|
||||
}
|
||||
|
||||
#define DFLTCC_FACILITY 151
|
||||
|
||||
static inline int is_dfltcc_enabled(void) {
|
||||
uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
|
||||
Z_REGISTER uint8_t r0 __asm__("r0");
|
||||
|
||||
memset(facilities, 0, sizeof(facilities));
|
||||
r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
|
||||
/* STFLE is supported since z9-109 and only in z/Architecture mode. When
|
||||
* compiling with -m31, gcc defaults to ESA mode, however, since the kernel
|
||||
* is 64-bit, it's always z/Architecture mode at runtime.
|
||||
*/
|
||||
__asm__ volatile(
|
||||
#ifndef __clang__
|
||||
".machinemode push\n"
|
||||
".machinemode zarch\n"
|
||||
#endif
|
||||
"stfle %[facilities]\n"
|
||||
#ifndef __clang__
|
||||
".machinemode pop\n"
|
||||
#endif
|
||||
: [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
|
||||
return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
|
||||
}
|
||||
|
||||
#define DFLTCC_FMT0 0
|
||||
|
||||
#define CVT_CRC32 0
|
||||
#define CVT_ADLER32 1
|
||||
#define HTT_FIXED 0
|
||||
#define HTT_DYNAMIC 1
|
||||
|
||||
#define DFLTCC_SIZEOF_GDHT_V0 384
|
||||
#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
|
||||
static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
|
||||
static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
|
||||
|
||||
static inline z_const char *oesc_msg(char *buf, int oesc) {
|
||||
if (oesc == 0x00)
|
||||
return NULL; /* Successful completion */
|
||||
else {
|
||||
sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
|
||||
return buf;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
C wrapper for the DEFLATE CONVERSION CALL instruction.
|
||||
*/
|
||||
typedef enum {
|
||||
DFLTCC_CC_OK = 0,
|
||||
DFLTCC_CC_OP1_TOO_SHORT = 1,
|
||||
DFLTCC_CC_OP2_TOO_SHORT = 2,
|
||||
DFLTCC_CC_OP2_CORRUPT = 2,
|
||||
DFLTCC_CC_AGAIN = 3,
|
||||
} dfltcc_cc;
|
||||
|
||||
#define DFLTCC_QAF 0
|
||||
#define DFLTCC_GDHT 1
|
||||
#define DFLTCC_CMPR 2
|
||||
#define DFLTCC_XPND 4
|
||||
#define HBT_CIRCULAR (1 << 7)
|
||||
#define DFLTCC_FN_MASK ((1 << 7) - 1)
|
||||
|
||||
/* Return lengths of high (starting at param->ho) and low (starting at 0) fragments of the circular history buffer. */
|
||||
static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) {
|
||||
*hl_high = MIN(param->hl, HB_SIZE - param->ho);
|
||||
*hl_low = param->hl - *hl_high;
|
||||
}
|
||||
|
||||
/* Notify instrumentation about an upcoming read/write access to the circular history buffer. */
|
||||
static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) {
|
||||
size_t hl_high, hl_low;
|
||||
|
||||
get_history_lengths(param, &hl_high, &hl_low);
|
||||
instrument_read_write(hist + param->ho, hl_high);
|
||||
instrument_read_write(hist, hl_low);
|
||||
}
|
||||
|
||||
/* Notify MSan about a completed write to the circular history buffer. */
|
||||
static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) {
|
||||
size_t hl_high, hl_low;
|
||||
|
||||
get_history_lengths(param, &hl_high, &hl_low);
|
||||
__msan_unpoison(hist + param->ho, hl_high);
|
||||
__msan_unpoison(hist, hl_low);
|
||||
}
|
||||
|
||||
static inline dfltcc_cc dfltcc(int fn, void *param,
|
||||
unsigned char **op1, size_t *len1,
|
||||
z_const unsigned char **op2, size_t *len2, void *hist) {
|
||||
unsigned char *t2 = op1 ? *op1 : NULL;
|
||||
unsigned char *orig_t2 = t2;
|
||||
size_t t3 = len1 ? *len1 : 0;
|
||||
z_const unsigned char *t4 = op2 ? *op2 : NULL;
|
||||
size_t t5 = len2 ? *len2 : 0;
|
||||
Z_REGISTER int r0 __asm__("r0");
|
||||
Z_REGISTER void *r1 __asm__("r1");
|
||||
Z_REGISTER unsigned char *r2 __asm__("r2");
|
||||
Z_REGISTER size_t r3 __asm__("r3");
|
||||
Z_REGISTER z_const unsigned char *r4 __asm__("r4");
|
||||
Z_REGISTER size_t r5 __asm__("r5");
|
||||
int cc;
|
||||
|
||||
/* Insert pre-instrumentation for DFLTCC. */
|
||||
switch (fn & DFLTCC_FN_MASK) {
|
||||
case DFLTCC_QAF:
|
||||
instrument_write(param, DFLTCC_SIZEOF_QAF);
|
||||
break;
|
||||
case DFLTCC_GDHT:
|
||||
instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0);
|
||||
instrument_read(t4, t5);
|
||||
break;
|
||||
case DFLTCC_CMPR:
|
||||
case DFLTCC_XPND:
|
||||
instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
|
||||
instrument_read(t4, t5);
|
||||
instrument_write(t2, t3);
|
||||
instrument_read_write_hist(param, hist);
|
||||
break;
|
||||
}
|
||||
|
||||
r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5;
|
||||
__asm__ volatile(
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
|
||||
#endif
|
||||
".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
|
||||
#endif
|
||||
"ipm %[cc]\n"
|
||||
: [r2] "+r" (r2)
|
||||
, [r3] "+r" (r3)
|
||||
, [r4] "+r" (r4)
|
||||
, [r5] "+r" (r5)
|
||||
, [cc] "=r" (cc)
|
||||
: [r0] "r" (r0)
|
||||
, [r1] "r" (r1)
|
||||
, [hist] "r" (hist)
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
, STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
|
||||
#endif
|
||||
: "cc", "memory");
|
||||
t2 = r2; t3 = r3; t4 = r4; t5 = r5;
|
||||
|
||||
/* Insert post-instrumentation for DFLTCC. */
|
||||
switch (fn & DFLTCC_FN_MASK) {
|
||||
case DFLTCC_QAF:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_QAF);
|
||||
break;
|
||||
case DFLTCC_GDHT:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
|
||||
break;
|
||||
case DFLTCC_CMPR:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
|
||||
__msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
|
||||
msan_unpoison_hist(param, hist);
|
||||
break;
|
||||
case DFLTCC_XPND:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
|
||||
__msan_unpoison(orig_t2, t2 - orig_t2);
|
||||
msan_unpoison_hist(param, hist);
|
||||
break;
|
||||
}
|
||||
|
||||
if (op1)
|
||||
*op1 = t2;
|
||||
if (len1)
|
||||
*len1 = t3;
|
||||
if (op2)
|
||||
*op2 = t4;
|
||||
if (len2)
|
||||
*len2 = t5;
|
||||
return (cc >> 28) & 3;
|
||||
}
|
||||
|
||||
#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
|
||||
|
||||
static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
|
||||
/* Initialize available functions */
|
||||
if (is_dfltcc_enabled()) {
|
||||
dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL);
|
||||
memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af));
|
||||
} else
|
||||
memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
|
||||
|
||||
/* Initialize parameter block */
|
||||
memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
|
||||
dfltcc_state->param.nt = 1;
|
||||
dfltcc_state->param.ribm = DFLTCC_RIBM;
|
||||
}
|
||||
|
||||
static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
|
||||
memcpy(dst, src, ALIGN_UP(size, 8) + extension_size);
|
||||
}
|
||||
|
||||
static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
|
||||
const unsigned char *buf, uInt count) {
|
||||
size_t offset;
|
||||
size_t n;
|
||||
|
||||
/* Do not use more than 32K */
|
||||
if (count > HB_SIZE) {
|
||||
buf += count - HB_SIZE;
|
||||
count = HB_SIZE;
|
||||
}
|
||||
offset = (param->ho + param->hl) % HB_SIZE;
|
||||
if (offset + count <= HB_SIZE)
|
||||
/* Circular history buffer does not wrap - copy one chunk */
|
||||
memcpy(history + offset, buf, count);
|
||||
else {
|
||||
/* Circular history buffer wraps - copy two chunks */
|
||||
n = HB_SIZE - offset;
|
||||
memcpy(history + offset, buf, n);
|
||||
memcpy(history, buf + n, count - n);
|
||||
}
|
||||
n = param->hl + count;
|
||||
if (n <= HB_SIZE)
|
||||
/* All history fits into buffer - no need to discard anything */
|
||||
param->hl = n;
|
||||
else {
|
||||
/* History does not fit into buffer - discard extra bytes */
|
||||
param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE;
|
||||
param->hl = HB_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
|
||||
unsigned char *buf) {
|
||||
size_t hl_high, hl_low;
|
||||
|
||||
get_history_lengths(param, &hl_high, &hl_low);
|
||||
memcpy(buf, history + param->ho, hl_high);
|
||||
memcpy(buf + hl_high, history, hl_low);
|
||||
}
|
191
3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c
vendored
Normal file
@ -0,0 +1,191 @@
/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */

/*
   Use the following commands to build zlib-ng with DFLTCC decompression support:

        $ ./configure --with-dfltcc-inflate
   or

        $ cmake -DWITH_DFLTCC_INFLATE=1 .

   and then

        $ make
*/

#include "zbuild.h"
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "dfltcc_inflate.h"
#include "dfltcc_detail.h"

void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;

    dfltcc_reset_state(&state->arch.common);
}

int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = &state->arch.common;

    /* Unsupported hardware */
    return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
}

static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &state->arch.common.param;
    size_t avail_in = strm->avail_in;
    size_t avail_out = strm->avail_out;
    dfltcc_cc cc;

    cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
                param, &strm->next_out, &avail_out,
                &strm->next_in, &avail_in, state->window);
    strm->avail_in = avail_in;
    strm->avail_out = avail_out;
    return cc;
}

dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = &state->arch.common;
    struct dfltcc_param_v0 *param = &dfltcc_state->param;
    dfltcc_cc cc;

    if (flush == Z_BLOCK || flush == Z_TREES) {
        /* DFLTCC does not support stopping on block boundaries */
        if (PREFIX(dfltcc_inflate_disable)(strm)) {
            *ret = Z_STREAM_ERROR;
            return DFLTCC_INFLATE_BREAK;
        } else
            return DFLTCC_INFLATE_SOFTWARE;
    }

    if (state->last) {
        if (state->bits != 0) {
            strm->next_in++;
            strm->avail_in--;
            state->bits = 0;
        }
        state->mode = CHECK;
        return DFLTCC_INFLATE_CONTINUE;
    }

    if (strm->avail_in == 0 && !param->cf)
        return DFLTCC_INFLATE_BREAK;

    /* if window not in use yet, initialize */
    if (state->wsize == 0)
        state->wsize = 1U << state->wbits;

    /* Translate stream to parameter block */
    param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
    param->sbb = state->bits;
    if (param->hl)
        param->nt = 0; /* Honor history for the first block */
    if (state->wrap & 4)
        param->cv = state->flags ? ZSWAP32(state->check) : state->check;

    /* Inflate */
    do {
        cc = dfltcc_xpnd(strm);
    } while (cc == DFLTCC_CC_AGAIN);

    /* Translate parameter block to stream */
    strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
    state->last = cc == DFLTCC_CC_OK;
    state->bits = param->sbb;
    if (state->wrap & 4)
        strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
    if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
        /* Report an error if stream is corrupted */
        state->mode = BAD;
        return DFLTCC_INFLATE_CONTINUE;
    }
    state->mode = TYPEDO;
    /* Break if operands are exhausted, otherwise continue looping */
    return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
        DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
}

int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;

    return !state->arch.common.param.nt;
}

/*
   Rotates a circular buffer.
   The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
*/
static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
    unsigned char *p = pivot;
    unsigned char tmp;

    while (p != start) {
        tmp = *start;
        *start = *p;
        *p = tmp;

        start++;
        p++;

        if (p == end)
            p = pivot;
        else if (start == pivot)
            pivot = p;
    }
}

int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = &state->arch.common;
    struct dfltcc_param_v0 *param = &dfltcc_state->param;

    if (!PREFIX(dfltcc_can_inflate)(strm))
        return 0;
    if (PREFIX(dfltcc_was_inflate_used)(strm))
        /* DFLTCC has already decompressed some data. Since there is not
         * enough information to resume decompression in software, the call
         * must fail.
         */
        return 1;
    /* DFLTCC was not used yet - decompress in software */
    memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
    /* Convert the window from the hardware to the software format */
    rotate(state->window, state->window + param->ho, state->window + HB_SIZE);
    state->whave = state->wnext = MIN(param->hl, state->wsize);
    return 0;
}

/*
   Preloading history.
*/
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
                                                     const unsigned char *dictionary, uInt dict_length) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &state->arch.common.param;

    /* if window not in use yet, initialize */
    if (state->wsize == 0)
        state->wsize = 1U << state->wbits;

    append_history(param, state->window, dictionary, dict_length);
    state->havedict = 1;
    return Z_OK;
}

int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
                                                     unsigned char *dictionary, uInt *dict_length) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &state->arch.common.param;

    if (dictionary && state->window)
        get_history(param, state->window, dictionary);
    if (dict_length)
        *dict_length = param->hl;
    return Z_OK;
}
67
3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h
vendored
Normal file
@ -0,0 +1,67 @@
#ifndef DFLTCC_INFLATE_H
#define DFLTCC_INFLATE_H

#include "dfltcc_common.h"

void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
typedef enum {
    DFLTCC_INFLATE_CONTINUE,
    DFLTCC_INFLATE_BREAK,
    DFLTCC_INFLATE_SOFTWARE,
} dfltcc_inflate_action;
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
                                                     const unsigned char *dictionary, uInt dict_length);
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
                                                     unsigned char *dictionary, uInt* dict_length);

#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)

#define INFLATE_PRIME_HOOK(strm, bits, value) \
    do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)

#define INFLATE_TYPEDO_HOOK(strm, flush) \
    if (PREFIX(dfltcc_can_inflate)((strm))) { \
        dfltcc_inflate_action action; \
\
        RESTORE(); \
        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
        LOAD(); \
        if (action == DFLTCC_INFLATE_CONTINUE) \
            break; \
        else if (action == DFLTCC_INFLATE_BREAK) \
            goto inf_leave; \
    }

#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))

#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))

#define INFLATE_MARK_HOOK(strm) \
    do { \
        if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
    } while (0)

#define INFLATE_SYNC_POINT_HOOK(strm) \
    do { \
        if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
    } while (0)

#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
    do { \
        if (PREFIX(dfltcc_can_inflate)((strm))) \
            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
    } while (0)

#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
    do { \
        if (PREFIX(dfltcc_can_inflate)((strm))) \
            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
    } while (0)

#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)

#endif
14
3rdparty/zlib-ng/arch/s390/s390_features.c
vendored
Normal file
@ -0,0 +1,14 @@
#include "zbuild.h"
#include "s390_features.h"

#ifdef HAVE_SYS_AUXV_H
#  include <sys/auxv.h>
#endif

#ifndef HWCAP_S390_VXRS
#define HWCAP_S390_VXRS HWCAP_S390_VX
#endif

void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
    features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
}
14
3rdparty/zlib-ng/arch/s390/s390_features.h
vendored
Normal file
@ -0,0 +1,14 @@
/* s390_features.h -- check for s390 features.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef S390_FEATURES_H_
#define S390_FEATURES_H_

struct s390_cpu_features {
    int has_vx;
};

void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);

#endif
20
3rdparty/zlib-ng/arch/s390/s390_functions.h
vendored
Normal file
@ -0,0 +1,20 @@
/* s390_functions.h -- s390 implementations for arch-specific functions.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef S390_FUNCTIONS_H_
#define S390_FUNCTIONS_H_

#ifdef S390_CRC32_VX
uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
#endif


#ifdef DISABLE_RUNTIME_CPU_DETECTION
#  if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
#    undef native_crc32
#    define native_crc32 = crc32_s390_vx
#  endif
#endif

#endif
47
3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
vendored
Normal file
@ -0,0 +1,47 @@
# Self-Hosted IBM Z Github Actions Runner.

FROM almalinux:9

RUN dnf update -y -q && \
    dnf install -y -q --enablerepo=crb wget git which sudo jq \
    cmake make automake autoconf m4 libtool ninja-build python3-pip \
    gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \
    glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel

RUN dnf install -y -q dotnet-sdk-6.0 && \
    echo "Using SDK - `dotnet --version`"

COPY runner-s390x.patch /tmp/runner.patch
COPY runner-global.json /tmp/global.json

RUN cd /tmp && \
    git clone -q https://github.com/actions/runner && \
    cd runner && \
    git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -b build && \
    git apply /tmp/runner.patch && \
    cp -f /tmp/global.json src/global.json


RUN cd /tmp/runner/src && \
    ./dev.sh layout && \
    ./dev.sh package && \
    rm -rf /root/.dotnet /root/.nuget

RUN useradd -c "Action Runner" -m actions-runner && \
    usermod -L actions-runner

RUN tar -xf /tmp/runner/_package/*.tar.gz -C /home/actions-runner && \
    chown -R actions-runner:actions-runner /home/actions-runner

#VOLUME /home/actions-runner

RUN rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \
    dnf clean all

USER actions-runner

# Scripts.
COPY fs/ /
WORKDIR /home/actions-runner
ENTRYPOINT ["/usr/bin/entrypoint"]
CMD ["/usr/bin/actions-runner"]
18
3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service
vendored
Normal file
@ -0,0 +1,18 @@
[Unit]
Description=Podman container: Gaplib Github Actions Runner
Wants=network-online.target
After=network-online.target
StartLimitIntervalSec=1
RequiresMountsFor=/run/user/1001/containers

[Service]
Environment=PODMAN_SYSTEMD_UNIT=%n
Restart=always
TimeoutStopSec=61
ExecStart=/usr/bin/podman start gaplib-actions-runner
ExecStop=/usr/bin/podman stop -t 1 gaplib-actions-runner
ExecStopPost=/usr/bin/podman stop -t 1 gaplib-actions-runner
Type=forking

[Install]
WantedBy=default.target
5
3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json
vendored
Normal file
@ -0,0 +1,5 @@
{
    "sdk": {
        "version": "6.0.421"
    }
}
243
3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch
vendored
Normal file
@ -0,0 +1,243 @@
diff --git a/src/Directory.Build.props b/src/Directory.Build.props
index 9db5fac..f02e235 100644
--- a/src/Directory.Build.props
+++ b/src/Directory.Build.props
@@ -44,6 +44,9 @@
 <PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-arm64'">
 <DefineConstants>$(DefineConstants);ARM64</DefineConstants>
 </PropertyGroup>
+ <PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-s390x'">
+ <DefineConstants>$(DefineConstants);S390X</DefineConstants>
+ </PropertyGroup>

 <!-- Set TRACE/DEBUG vars -->
 <PropertyGroup>
diff --git a/src/Misc/externals.sh b/src/Misc/externals.sh
index 383221e..1555f67 100755
--- a/src/Misc/externals.sh
+++ b/src/Misc/externals.sh
@@ -189,3 +189,8 @@ if [[ "$PACKAGERUNTIME" == "linux-arm" ]]; then
 acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-armv7l.tar.gz" node16 fix_nested_dir
 acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-armv7l.tar.gz" node20 fix_nested_dir
 fi
+
+if [[ "$PACKAGERUNTIME" == "linux-s390x" ]]; then
+ acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-s390x.tar.gz" node16 fix_nested_dir
+ acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-s390x.tar.gz" node20 fix_nested_dir
+fi
diff --git a/src/Misc/layoutroot/config.sh b/src/Misc/layoutroot/config.sh
index 14cc6ba..9b5b8e6 100755
--- a/src/Misc/layoutroot/config.sh
+++ b/src/Misc/layoutroot/config.sh
@@ -20,25 +20,29 @@ then

 message="Execute sudo ./bin/installdependencies.sh to install any missing Dotnet Core 6.0 dependencies."

- ldd ./bin/libcoreclr.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
- fi
+ ARCH=`uname -m`
+ if [ "${ARCH}" != "s390x" -a "${ARCH}" != "ppc64le" ]
+ then
+ ldd ./bin/libcoreclr.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi

- ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
- fi
+ ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi

- ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
+ ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi
 fi

 if ! [ -x "$(command -v ldconfig)" ]; then
diff --git a/src/Runner.Common/Constants.cs b/src/Runner.Common/Constants.cs
index 177e3c9..9545981 100644
--- a/src/Runner.Common/Constants.cs
+++ b/src/Runner.Common/Constants.cs
@@ -58,7 +58,8 @@ namespace GitHub.Runner.Common
 X86,
 X64,
 Arm,
- Arm64
+ Arm64,
+ S390x
 }

 public static class Runner
@@ -81,6 +82,8 @@ namespace GitHub.Runner.Common
 public static readonly Architecture PlatformArchitecture = Architecture.Arm;
 #elif ARM64
 public static readonly Architecture PlatformArchitecture = Architecture.Arm64;
+#elif S390X
+ public static readonly Architecture PlatformArchitecture = Architecture.S390x;
 #else
 public static readonly Architecture PlatformArchitecture = Architecture.X64;
 #endif
diff --git a/src/Runner.Common/Util/VarUtil.cs b/src/Runner.Common/Util/VarUtil.cs
index 97273a1..2a34430 100644
--- a/src/Runner.Common/Util/VarUtil.cs
+++ b/src/Runner.Common/Util/VarUtil.cs
@@ -53,6 +53,8 @@ namespace GitHub.Runner.Common.Util
 return "ARM";
 case Constants.Architecture.Arm64:
 return "ARM64";
+ case Constants.Architecture.S390x:
+ return "S390X";
 default:
 throw new NotSupportedException(); // Should never reach here.
 }
diff --git a/src/Test/L0/ConstantGenerationL0.cs b/src/Test/L0/ConstantGenerationL0.cs
index 2042485..a9d8b46 100644
--- a/src/Test/L0/ConstantGenerationL0.cs
+++ b/src/Test/L0/ConstantGenerationL0.cs
@@ -20,6 +20,7 @@ namespace GitHub.Runner.Common.Tests
 "linux-x64",
 "linux-arm",
 "linux-arm64",
+ "linux-s390x",
 "osx-x64",
 "osx-arm64"
 };
diff --git a/src/Test/L0/Listener/SelfUpdaterL0.cs b/src/Test/L0/Listener/SelfUpdaterL0.cs
index 26ba65e..6791df3 100644
--- a/src/Test/L0/Listener/SelfUpdaterL0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterL0.cs
@@ -1,4 +1,4 @@
-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -16,6 +16,7 @@ using Xunit;

 namespace GitHub.Runner.Common.Tests.Listener
 {
+#if !S390X // Self-update is not currently supported on S390X
 public sealed class SelfUpdaterL0
 {
 private Mock<IRunnerServer> _runnerServer;
@@ -291,5 +292,6 @@ namespace GitHub.Runner.Common.Tests.Listener
 }
 }
 }
+#endif
 }
 #endif
diff --git a/src/Test/L0/Listener/SelfUpdaterV2L0.cs b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
index 5115a6b..dd8d198 100644
--- a/src/Test/L0/Listener/SelfUpdaterV2L0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
@@ -1,4 +1,4 @@
-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
 using System;
 using System.Collections.Generic;
 using System.IO;
diff --git a/src/Test/L0/Worker/StepHostL0.cs b/src/Test/L0/Worker/StepHostL0.cs
index f6b5889..26f8e21 100644
--- a/src/Test/L0/Worker/StepHostL0.cs
+++ b/src/Test/L0/Worker/StepHostL0.cs
@@ -31,7 +31,7 @@ namespace GitHub.Runner.Common.Tests.Worker
 return hc;
 }

-#if OS_LINUX
+#if OS_LINUX && !S390X
 [Fact]
 [Trait("Level", "L0")]
 [Trait("Category", "Worker")]
diff --git a/src/dev.sh b/src/dev.sh
index fa637d1..8c66f37 100755
--- a/src/dev.sh
+++ b/src/dev.sh
@@ -54,6 +54,7 @@ elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
 case $CPU_NAME in
 armv7l) RUNTIME_ID="linux-arm";;
 aarch64) RUNTIME_ID="linux-arm64";;
+ s390x) RUNTIME_ID="linux-s390x";;
 esac
 fi
 elif [[ "$CURRENT_PLATFORM" == 'darwin' ]]; then
@@ -80,7 +81,7 @@ if [[ "$CURRENT_PLATFORM" == 'windows' ]]; then
 exit 1
 fi
 elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
- if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') ]]; then
+ if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') && ("$RUNTIME_ID" != 'linux-s390x') ]]; then
 echo "Failed: Can't build $RUNTIME_ID package $CURRENT_PLATFORM" >&2
 exit 1
 fi
@@ -199,7 +200,8 @@ function package ()
 popd > /dev/null
 }

-if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet") ]]; then
+if [[ "${RUNTIME_ID}" != "linux-s390x" && ((! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet")) ]]; then
+

 # Download dotnet SDK to ../_dotnetsdk directory
 heading "Ensure Dotnet SDK"
@@ -224,8 +226,10 @@ if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTN
 echo "${DOTNETSDK_VERSION}" > "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}"
 fi

-echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
-export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+if [[ -d "${DOTNETSDK_INSTALLDIR}" ]]; then
+ echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
+ export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+fi

 heading "Dotnet SDK Version"
 dotnet --version
diff --git a/src/dir.proj b/src/dir.proj
index 056a312..8370922 100644
--- a/src/dir.proj
+++ b/src/dir.proj
@@ -41,8 +41,18 @@
 </ItemGroup>

 <Target Name="Build" DependsOnTargets="GenerateConstant">
- <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" />
- <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);RuntimeIdentifier=$(PackageRuntime);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
+ <PropertyGroup>
+ <!-- Normally we want to publish a self-contained app for $(PackageRuntime) -->
+ <PublishRuntimeIdentifier>RuntimeIdentifier=$(PackageRuntime)</PublishRuntimeIdentifier>
+ <!-- However, on s390x there are no apphost or runtime packages on nuget.org, so self-contained publishing is not supported.
+ Perform a non-self-contained publish using the current runtime identifier (normally something like rhel.8-s390x) instead.
+ In addition, when not using an explicit runtime identifier, the SDK will copy runtime assets from dependent packages;
+ as this would confuse the expected layout, disable that behavior as well. -->
+ <PublishRuntimeIdentifier Condition="'$(PackageRuntime)' == 'linux-s390x'">SelfContained=false;CopyLocalRuntimeTargetAssets=false</PublishRuntimeIdentifier>
+ </PropertyGroup>
+
+ <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" Properties="$(PublishRuntimeIdentifier)" />
+ <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);$(PublishRuntimeIdentifier);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
 <Exec Command="%22$(DesktopMSBuild)%22 Runner.Service/Windows/RunnerService.csproj /p:Configuration=$(BUILDCONFIG) /p:PackageRuntime=$(PackageRuntime) /p:OutputPath=%22$(MSBuildProjectDirectory)/../_layout/bin%22" ConsoleToMSBuild="true" Condition="'$(PackageRuntime)' == 'win-x64' Or '$(PackageRuntime)' == 'win-x86' Or '$(PackageRuntime)' == 'win-arm64'" />
 </Target>
11
3rdparty/zlib-ng/arch/x86/Makefile.in
vendored
@ -35,7 +35,6 @@ all: \
chunkset_ssse3.o chunkset_ssse3.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
insert_string_sse42.o insert_string_sse42.lo \
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
slide_hash_avx2.o slide_hash_avx2.lo \
@ -77,12 +76,6 @@ compare256_sse2.o:
compare256_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

insert_string_sse42.o:
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

insert_string_sse42.lo:
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

crc32_pclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

@ -90,10 +83,10 @@ crc32_pclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_vpclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

crc32_vpclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

slide_hash_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

17
3rdparty/zlib-ng/arch/x86/adler32_avx2.c
vendored
@ -9,24 +9,15 @@

#ifdef X86_AVX2

#include "../../zbuild.h"
#include "zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "../../adler32_p.h"
#include "adler32_p.h"
#include "adler32_avx2_p.h"
#include "x86_intrins.h"

#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);

#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif

static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
@ -44,9 +35,9 @@ rem_peel:
}
} else if (len < 32) {
if (COPY) {
return copy_sub32(adler, dst, src, len);
return adler32_fold_copy_sse42(adler, dst, src, len);
} else {
return sub32(adler, src, len);
return adler32_ssse3(adler, src, len);
}
}

13
3rdparty/zlib-ng/arch/x86/adler32_avx512.c
vendored
@ -8,10 +8,9 @@

#ifdef X86_AVX512

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "../../cpu_features.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
@ -33,13 +32,7 @@ rem_peel:
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
}

#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}

__m512i vbuf, vs1_0, vs3;

21
3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
vendored
@ -9,11 +9,10 @@

#ifdef X86_AVX512VNNI

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
@ -28,20 +27,10 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size

rem_peel:
if (len < 32)
#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif

if (len < 64)
#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif

const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
@ -135,11 +124,7 @@ rem_peel_copy:
__m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
_mm256_mask_storeu_epi8(dst, storemask, copy_vec);

#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}

const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,

5
3rdparty/zlib-ng/arch/x86/adler32_sse42.c
vendored
@ -6,9 +6,8 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"
#include <immintrin.h>

4
3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
vendored
@ -6,8 +6,8 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"

#ifdef X86_SSSE3

14
3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c
vendored
@ -4,10 +4,7 @@

#include "zbuild.h"

/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
 * code size by sharing the chunkcopy functions, which will certainly compile
 * to identical machine code */
#if defined(X86_SSSE3) && defined(X86_SSE2)
#if defined(X86_SSSE3)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"

@ -19,8 +16,6 @@ typedef __m128i chunk_t;
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL

static const lut_rem_pair perm_idx_lut[13] = {
{0, 1}, /* 3 */
@ -83,14 +78,11 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
return ret_vec;
}

extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);

#define CHUNKSIZE chunksize_ssse3
#define CHUNKMEMSET chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#define CHUNKCOPY chunkcopy_ssse3
#define CHUNKUNROLL chunkunroll_ssse3

#include "chunkset_tpl.h"

5
3rdparty/zlib-ng/arch/x86/compare256_avx2.c
vendored
@ -3,8 +3,9 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"

#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)

5
3rdparty/zlib-ng/arch/x86/compare256_sse2.c
vendored
@ -3,8 +3,9 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"

#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

@ -26,27 +26,26 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i xmm_crc_part = _mm_setzero_si128();
#ifdef COPY
char ALIGNED_(16) partial_buf[16] = { 0 };
#else
#ifndef COPY
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
int32_t first = init_crc != 0;

/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
 * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
 * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
 * by definition can be up to 15 bytes + one full vector load. */
assert(len >= 31 || first == 0);
/* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
 * for the aligning load that occurs. If there's an initial CRC, to carry it forward through
 * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
 * up to 15 bytes + one full vector load. */
assert(len >= 16 || first == 0);
#endif
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

if (len < 16) {
#ifdef COPY
if (len == 0)
return;

memcpy(partial_buf, src, len);
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
#ifdef COPY
memcpy(dst, partial_buf, len);
#endif
goto partial;
@ -63,9 +62,23 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint

if (algn_diff < 4 && init_crc != 0) {
xmm_t0 = xmm_crc_part;
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
if (len >= 32) {
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
} else {
memcpy(partial_buf, src + 16, len - 16);
xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
src += 16;
len -= 16;
#ifdef COPY
dst -= algn_diff;
#endif
goto partial;
}

src += 16;
len -= 16;
}

26
3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
vendored
@ -17,7 +17,7 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "zbuild.h"

#include <immintrin.h>
#include <wmmintrin.h>
@ -26,8 +26,9 @@
# include <immintrin.h>
#endif

#include "../../crc32_fold.h"
#include "../../crc32_braid_p.h"
#include "crc32.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "x86_intrins.h"
#include <assert.h>

@ -350,11 +351,22 @@ Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
return crc->value;
}

static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) {
uint32_t c = (~crc) & 0xffffffff;

while (len) {
len--;
DO1;
}

return c ^ 0xffffffff;
}

Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
/* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
 * these short lengths might also prove to be effective */
if (len < 64)
return PREFIX(crc32_braid)(crc32, buf, len);
/* For lens smaller than ~12, crc32_small method is faster.
 * But there are also minimum requirements for the pclmul functions due to alignment */
if (len < 16)
return crc32_small(crc32, buf, len);

crc32_fold ALIGNED_(16) crc_state;
CRC32_FOLD_RESET(&crc_state);

2
3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
vendored
@ -3,7 +3,7 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
#ifdef X86_VPCLMULQDQ_CRC

#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy

24
3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
vendored
@ -1,24 +0,0 @@

/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifdef X86_SSE42
#include "../../zbuild.h"
#include <nmmintrin.h>
#include "../../deflate.h"

#define HASH_CALC(s, h, val)\
    h = _mm_crc32_u32(h, val)

#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0

#define UPDATE_HASH update_hash_sse42
#define INSERT_STRING insert_string_sse42
#define QUICK_INSERT_STRING quick_insert_string_sse42

#include "../../insert_string_tpl.h"
#endif
4
3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
vendored
@ -9,8 +9,8 @@

 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"

#include <immintrin.h>

4
3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
vendored
@ -8,8 +8,8 @@

 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"

#include <immintrin.h>
#include <assert.h>

24
3rdparty/zlib-ng/arch/x86/x86_features.c
vendored
@ -7,7 +7,7 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "zbuild.h"
#include "x86_features.h"

#ifdef _MSC_VER
@ -15,6 +15,13 @@
#else
// Newer versions of GCC and clang come with cpuid.h
#  include <cpuid.h>
#  ifdef X86_HAVE_XSAVE_INTRIN
#    if __GNUC__ == 8
#      include <xsaveintrin.h>
#    else
#      include <immintrin.h>
#    endif
#  endif
#endif

#include <string.h>
@ -29,6 +36,7 @@ static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx,
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}
@ -43,12 +51,13 @@ static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx,
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}

static inline uint64_t xgetbv(unsigned int xcr) {
#ifdef _MSC_VER
#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
return _xgetbv(xcr);
#else
uint32_t eax, edx;
@ -90,7 +99,16 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {

// check AVX512 bits if the OS supports saving ZMM registers
if (features->has_os_save_zmm) {
features->has_avx512 = ebx & 0x00010000;
features->has_avx512f = ebx & 0x00010000;
if (features->has_avx512f) {
// According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
// AVX512(DQ,BW,VL).
features->has_avx512dq = ebx & 0x00020000;
features->has_avx512bw = ebx & 0x40000000;
features->has_avx512vl = ebx & 0x80000000;
}
features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
&& features->has_avx512vl;
features->has_avx512vnni = ecx & 0x800;
}
}

14
3rdparty/zlib-ng/arch/x86/x86_features.h
vendored
@ -1,14 +1,18 @@

/* x86_features.h -- check for CPU features
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_

struct x86_cpu_features {
int has_avx2;
int has_avx512;
int has_avx512f;
int has_avx512dq;
int has_avx512bw;
int has_avx512vl;
int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
int has_avx512vnni;
int has_sse2;
int has_ssse3;
@ -21,4 +25,4 @@ struct x86_cpu_features {

void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);

#endif /* CPU_H_ */
#endif /* X86_FEATURES_H_ */

172
3rdparty/zlib-ng/arch/x86/x86_functions.h
vendored
Normal file
@ -0,0 +1,172 @@
/* x86_functions.h -- x86 implementations for arch-specific functions.
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FUNCTIONS_H_
#define X86_FUNCTIONS_H_

#ifdef X86_SSE2
uint32_t chunksize_sse2(void);
uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);

#  ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
void slide_hash_sse2(deflate_state *s);
#  endif
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
#endif

#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif

#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_AVX2
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx2(void);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);

#  ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
void slide_hash_avx2(deflate_state *s);
#  endif
void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_PCLMULQDQ_CRC
uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef X86_VPCLMULQDQ_CRC
uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif


#ifdef DISABLE_RUNTIME_CPU_DETECTION
// X86 - SSE2
#  if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_sse2
#    undef native_chunksize
#    define native_chunksize chunksize_sse2
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_sse2
#    undef native_slide_hash
#    define native_slide_hash slide_hash_sse2
#    ifdef HAVE_BUILTIN_CTZ
#      undef native_compare256
#      define native_compare256 compare256_sse2
#      undef native_longest_match
#      define native_longest_match longest_match_sse2
#      undef native_longest_match_slow
#      define native_longest_match_slow longest_match_slow_sse2
#    endif
#endif
// X86 - SSSE3
#  if defined(X86_SSSE3) && defined(__SSSE3__)
#    undef native_adler32
#    define native_adler32 adler32_ssse3
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_ssse3
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_ssse3
#  endif
// X86 - SSE4.2
#  if defined(X86_SSE42) && defined(__SSE4_2__)
#    undef native_adler32_fold_copy
#    define native_adler32_fold_copy adler32_fold_copy_sse42
#  endif

// X86 - PCLMUL
#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__)
#  undef native_crc32
#  define native_crc32 crc32_pclmulqdq
#  undef native_crc32_fold
#  define native_crc32_fold crc32_fold_pclmulqdq
#  undef native_crc32_fold_copy
#  define native_crc32_fold_copy crc32_fold_pclmulqdq_copy
#  undef native_crc32_fold_final
#  define native_crc32_fold_final crc32_fold_pclmulqdq_final
#  undef native_crc32_fold_reset
#  define native_crc32_fold_reset crc32_fold_pclmulqdq_reset
#endif
// X86 - AVX
#  if defined(X86_AVX2) && defined(__AVX2__)
#    undef native_adler32
#    define native_adler32 adler32_avx2
#    undef native_adler32_fold_copy
#    define native_adler32_fold_copy adler32_fold_copy_avx2
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_avx2
#    undef native_chunksize
#    define native_chunksize chunksize_avx2
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_avx2
#    undef native_slide_hash
#    define native_slide_hash slide_hash_avx2
#    ifdef HAVE_BUILTIN_CTZ
#      undef native_compare256
#      define native_compare256 compare256_avx2
#      undef native_longest_match
#      define native_longest_match longest_match_avx2
#      undef native_longest_match_slow
#      define native_longest_match_slow longest_match_slow_avx2
#    endif
#  endif

// X86 - AVX512 (F,DQ,BW,Vl)
#  if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
#    undef native_adler32
#    define native_adler32 adler32_avx512
#    undef native_adler32_fold_copy
#    define native_adler32_fold_copy adler32_fold_copy_avx512
// X86 - AVX512 (VNNI)
#    if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
#      undef native_adler32
#      define native_adler32 adler32_avx512_vnni
#      undef native_adler32_fold_copy
#      define native_adler32_fold_copy adler32_fold_copy_avx512_vnni
#    endif
// X86 - VPCLMULQDQ
#    if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
#      undef native_crc32
#      define native_crc32 crc32_vpclmulqdq
#      undef native_crc32_fold
#      define native_crc32_fold crc32_fold_vpclmulqdq
#      undef native_crc32_fold_copy
#      define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy
#      undef native_crc32_fold_final
#      define native_crc32_fold_final crc32_fold_vpclmulqdq_final
#      undef native_crc32_fold_reset
#      define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset
#    endif
#  endif
#endif

#endif /* X86_FUNCTIONS_H_ */
4
3rdparty/zlib-ng/arch/x86/x86_intrins.h
vendored
@ -7,7 +7,7 @@

#ifdef __AVX2__
#include <immintrin.h>

#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
 || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
__m128i r;
@ -29,7 +29,7 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
/* GCC <9 is missing some AVX512 intrinsics.
 */
#ifdef __AVX512F__
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>

#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \

29
3rdparty/zlib-ng/arch_functions.h
vendored
Normal file
@ -0,0 +1,29 @@
/* arch_functions.h -- Arch-specific function prototypes.
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef CPU_FUNCTIONS_H_
#define CPU_FUNCTIONS_H_

#include "zbuild.h"
#include "zutil.h"
#include "crc32.h"
#include "deflate.h"
#include "fallback_builtins.h"

#include "arch/generic/generic_functions.h"

#if defined(X86_FEATURES)
#  include "arch/x86/x86_functions.h"
#elif defined(ARM_FEATURES)
#  include "arch/arm/arm_functions.h"
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
#  include "arch/power/power_functions.h"
#elif defined(S390_FEATURES)
#  include "arch/s390/s390_functions.h"
#elif defined(RISCV_FEATURES)
#  include "arch/riscv/riscv_functions.h"
#endif

#endif
8
3rdparty/zlib-ng/chunkset_tpl.h
vendored
8
3rdparty/zlib-ng/chunkset_tpl.h
vendored
@ -5,7 +5,7 @@
|
||||
#include "zbuild.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
|
||||
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
|
||||
#endif
|
||||
|
||||
@ -25,7 +25,7 @@ Z_INTERNAL uint32_t CHUNKSIZE(void) {
|
||||
without iteration, which will hopefully make the branch prediction more
|
||||
reliable. */
|
||||
#ifndef HAVE_CHUNKCOPY
|
||||
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
Assert(len > 0, "chunkcopy should never have a length 0");
|
||||
chunk_t chunk;
|
||||
int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
|
||||
@ -54,7 +54,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
least 258 bytes of output space available (258 being the maximum length
|
||||
output from a single token; see inflate_fast()'s assumptions below). */
|
||||
#ifndef HAVE_CHUNKUNROLL
|
||||
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
|
||||
static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
|
||||
unsigned char const *from = out - *dist;
|
||||
chunk_t chunk;
|
||||
while (*dist < *len && *dist < sizeof(chunk_t)) {
|
||||
@ -98,7 +98,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
|
||||
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
|
||||
Assert(dist > 0, "chunkmemset cannot have a distance 0");
|
||||
/* Only AVX2 */
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
|
||||
if (len <= 16) {
|
||||
return chunkmemset_ssse3(out, dist, len);
|
||||
}
|
||||
|
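The chunkset hunks drop the X86_SSE2 term from the SSSE3 guards (SSSE3 support implies SSE2) and change CHUNKCOPY and CHUNKUNROLL from exported Z_INTERNAL functions to static inline ones. The core idea behind CHUNKCOPY is to copy in whole chunk_t-sized units, handling the length's remainder first so that every following store is a full chunk. Below is a portable sketch of that partitioning with a fixed 16-byte chunk; the real template works on a SIMD chunk_t and relies on the caller guaranteeing slack space in the output buffer, which this version does not.

/* Standalone sketch of the "remainder first, then whole chunks" copy
 * behind CHUNKCOPY. Illustrative only; not the template's actual code. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define CHUNK 16

static uint8_t *chunkcopy_sketch(uint8_t *out, const uint8_t *from, unsigned len) {
    /* Bytes in the final, partial chunk (always 1..CHUNK). */
    unsigned align = ((len - 1) % CHUNK) + 1;
    memcpy(out, from, align);       /* handle the remainder up front */
    out += align;
    from += align;
    len -= align;
    while (len > 0) {               /* everything left is whole chunks */
        memcpy(out, from, CHUNK);
        out += CHUNK;
        from += CHUNK;
        len -= CHUNK;
    }
    return out;
}

int main(void) {
    const uint8_t src[40] = "chunk-copy demonstration, 40 bytes long";
    uint8_t dst[40];
    chunkcopy_sketch(dst, src, sizeof src);
    printf("%.39s\n", (const char *)dst);
    return 0;
}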
115
3rdparty/zlib-ng/cmake/detect-arch.c
vendored
Normal file
@ -0,0 +1,115 @@
// archdetect.c -- Detect compiler architecture and raise preprocessor error
// containing a simple arch identifier.
// Copyright (C) 2019 Hans Kristian Rosbach
// Licensed under the Zlib license, see LICENSE.md for details

// x86_64
#if defined(__x86_64__) || defined(_M_X64)
    #error archfound x86_64

// x86
#elif defined(__i386) || defined(_M_IX86)
    #error archfound i686

// ARM
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
    #error archfound aarch64
#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM)
    #if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__)
        #error archfound armv8
    #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
        #error archfound armv7
    #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__)
        #error archfound armv6
    #elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
        #error archfound armv5
    #elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__)
        #error archfound armv4
    #elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__)
        #error archfound armv3
    #elif defined(__ARM_ARCH_2__)
        #error archfound armv2
    #endif

// PowerPC
#elif defined(__powerpc__) || defined(_ppc__) || defined(__PPC__)
    #if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__)
        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
            #error archfound powerpc64le
        #else
            #error archfound powerpc64
        #endif
    #else
        #error archfound powerpc
    #endif

// --------------- Less common architectures alphabetically below ---------------

// ALPHA
#elif defined(__alpha__) || defined(__alpha)
    #error archfound alpha

// Blackfin
#elif defined(__BFIN__)
    #error archfound blackfin

// Itanium
#elif defined(__ia64) || defined(_M_IA64)
    #error archfound ia64

// MIPS
#elif defined(__mips__) || defined(__mips)
    #error archfound mips

// Motorola 68000-series
#elif defined(__m68k__)
    #error archfound m68k

// SuperH
#elif defined(__sh__)
    #error archfound sh

// SPARC
#elif defined(__sparc__) || defined(__sparc)
    #if defined(__sparcv9) || defined(__sparc_v9__)
        #error archfound sparc9
    #elif defined(__sparcv8) || defined(__sparc_v8__)
        #error archfound sparc8
    #endif

// SystemZ
#elif defined(__370__)
    #error archfound s370
#elif defined(__s390__)
    #error archfound s390
#elif defined(__s390x) || defined(__zarch__)
    #error archfound s390x

// PARISC
#elif defined(__hppa__)
    #error archfound parisc

// RS-6000
#elif defined(__THW_RS6000)
    #error archfound rs6000

// RISC-V
#elif defined(__riscv)
    #if __riscv_xlen == 64
        #error archfound riscv64
    #elif __riscv_xlen == 32
        #error archfound riscv32
    #endif

// LOONGARCH
#elif defined(__loongarch_lp64)
    #error archfound loongarch64

// Emscripten (WebAssembly)
#elif defined(__EMSCRIPTEN__)
    #error archfound wasm32

// return 'unrecognized' if we do not know what architecture this is
#else
    #error archfound unrecognized
#endif
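detect-arch.c is never meant to compile: whichever branch the preprocessor selects stops the build with an "#error archfound <id>" message, so the build system only has to run the compiler on the file and scrape the identifier out of the error text (zlib-ng's CMake presumably does this with a try_compile-style probe and a regex over the captured output; that wiring is not part of this diff). The same trick in miniature, with an invented token rather than archfound:

/* probe_bits.c -- illustrative only, not part of the patch. Like
 * detect-arch.c it is expected to fail to compile: the selected branch
 * aborts with an #error whose text carries the answer, which the caller
 * then extracts from the compiler's error output. */
#if defined(__LP64__) || defined(_WIN64)
    #error bitsfound 64
#else
    #error bitsfound 32
#endif

A shell-level equivalent of the consuming side would be something along the lines of "cc -c probe_bits.c 2>&1 | grep -o 'bitsfound [0-9]*'", again purely as an illustration of the technique.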
Some files were not shown because too many files have changed in this diff.